From 21e7fff8bb6b3dcad9a2e5f7873f350b0994e945 Mon Sep 17 00:00:00 2001 From: kingluo Date: Thu, 9 May 2024 18:21:33 +0800 Subject: [PATCH 01/25] feat(1808): migrate to Linux 6.8.9 close #1808 --- Makefile | 2 +- fw/cache.c | 12 +++++------ fw/http.c | 4 ++-- fw/http.h | 2 +- fw/http_parser.c | 2 +- fw/http_sched_hash.c | 4 ++-- fw/htype.h | 2 ++ fw/main.c | 6 +++--- fw/procfs.c | 6 +++--- fw/sock.c | 25 +++++++++++----------- fw/sock_clnt.c | 2 +- fw/ss_skb.c | 10 ++++----- fw/str_avx2.S | 3 +-- fw/t/Makefile | 4 ++-- ktest/crypto/sha512_base.h | 43 ++++++++++++++++++++++++++++++++++++++ ktest/linux/kernel.h | 6 ++++++ lib/str_simd.S | 2 -- tls/bignum_x86-64.S | 1 - tls/crypto.h | 2 +- tls/ecdh.c | 1 + tls/tls_internal.h | 4 ++-- 21 files changed, 94 insertions(+), 49 deletions(-) create mode 100644 ktest/crypto/sha512_base.h diff --git a/Makefile b/Makefile index b104661b2..b861a6b61 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,7 @@ else test-gt = $(shell test $(strip $1)0 -gt $(strip $2)0 && echo y) endif -TFW_CFLAGS = $(DEFINES) -Werror +TFW_CFLAGS = $(DEFINES) -Werror -Wno-missing-prototypes -Wno-error=missing-declarations -Wno-missing-declarations ifdef DEBUG ifeq ($(call test-gt, 1, $(DEBUG)), y) ERROR = "DEBUG must be greater than 0" diff --git a/fw/cache.c b/fw/cache.c index 6918ab55a..de1a652ff 100644 --- a/fw/cache.c +++ b/fw/cache.c @@ -2597,13 +2597,13 @@ tfw_cache_add_body_page(TfwMsgIter *it, char *p, int sz, bool h2, * * Different strategies are used to avoid extra data copying depending on * client connection type: - * - for http connections - pages are reused in skbs and SKBTX_SHARED_FRAG is + * - for http connections - pages are reused in skbs and SKBFL_SHARED_FRAG is * set to avoid any data copies. - * - for https connections - pages are reused in skbs and SKBTX_SHARED_FRAG is + * - for https connections - pages are reused in skbs and SKBFL_SHARED_FRAG is * set, but in-place crypto operations are not allowed, so data copy happens * right before data is pushed into network. * - for h2 connections - every response has unique frame header, so need to - * copy on constructing response body from cache. SKBTX_SHARED_FRAG is left + * copy on constructing response body from cache. SKBFL_SHARED_FRAG is left * unset to allow in-place crypto operations. * * Since we can't encrypt shared data in-place we always copy it, so we need @@ -2629,14 +2629,14 @@ tfw_cache_build_resp_body(TDB *db, TdbVRec *trec, TfwMsgIter *it, char *p, * TX flags for headers and body differ. 
*/ if (!it->skb || (it->frag + 1 >= MAX_SKB_FRAGS) - || (sh_frag == !(skb_shinfo(it->skb)->tx_flags & SKBTX_SHARED_FRAG))) + || (sh_frag == !(skb_shinfo(it->skb)->tx_flags & SKBFL_SHARED_FRAG))) { if ((r = tfw_msg_iter_append_skb(it))) return r; if (!sh_frag) - skb_shinfo(it->skb)->tx_flags &= ~SKBTX_SHARED_FRAG; + skb_shinfo(it->skb)->tx_flags &= ~SKBFL_SHARED_FRAG; else - skb_shinfo(it->skb)->tx_flags |= SKBTX_SHARED_FRAG; + skb_shinfo(it->skb)->tx_flags |= SKBFL_SHARED_FRAG; } while (1) { diff --git a/fw/http.c b/fw/http.c index 0eb244044..0a9637874 100644 --- a/fw/http.c +++ b/fw/http.c @@ -4821,11 +4821,11 @@ tfw_h2_append_predefined_body(TfwHttpResp *resp, unsigned int stream_id, it->frag = skb_shinfo(it->skb)->nr_frags - 1; if ((it->frag + 1 >= MAX_SKB_FRAGS) - || (skb_shinfo(it->skb)->tx_flags & SKBTX_SHARED_FRAG)) + || (skb_shinfo(it->skb)->tx_flags & SKBFL_SHARED_FRAG)) { if ((r = tfw_msg_iter_append_skb(it))) return r; - skb_shinfo(it->skb)->tx_flags &= ~SKBTX_SHARED_FRAG; + skb_shinfo(it->skb)->tx_flags &= ~SKBFL_SHARED_FRAG; } data = body->data; diff --git a/fw/http.h b/fw/http.h index 00d362793..4959c8bc3 100644 --- a/fw/http.h +++ b/fw/http.h @@ -21,7 +21,7 @@ #ifndef __TFW_HTTP_H__ #define __TFW_HTTP_H__ -#include +#include #include "http_types.h" #include "connection.h" diff --git a/fw/http_parser.c b/fw/http_parser.c index 2ed3f73dc..f2a35c9c2 100644 --- a/fw/http_parser.c +++ b/fw/http_parser.c @@ -11641,7 +11641,7 @@ tfw_http_parse_resp(void *resp_data, unsigned char *data, unsigned int len, tfw_http_msg_hdr_open(msg, p); __FSM_MOVE_n(Resp_StatusCodeBeg, 9); } - /* fall through */ + fallthrough; default: TFW_PARSER_DROP(Resp_HttpVer); } diff --git a/fw/http_sched_hash.c b/fw/http_sched_hash.c index b97ec8439..a0a42e7da 100644 --- a/fw/http_sched_hash.c +++ b/fw/http_sched_hash.c @@ -355,7 +355,7 @@ tfw_sched_hash_add_grp(TfwSrvGroup *sg, void *data) return -EINVAL; seed = get_random_long(); - seed_inc = get_random_int(); + seed_inc = get_random_long(); list_for_each_entry(srv, &sg->srv_list, list) conn_n += srv->conn_n; @@ -405,7 +405,7 @@ tfw_sched_hash_add_srv(TfwServer *srv) return -EEXIST; seed = get_random_long(); - seed_inc = get_random_int(); + seed_inc = get_random_long(); size = sizeof(TfwHashConnList) + srv->conn_n * sizeof(TfwHashConn); if (!(cl = kzalloc(size, GFP_KERNEL))) diff --git a/fw/htype.h b/fw/htype.h index a746070ad..a45c27044 100644 --- a/fw/htype.h +++ b/fw/htype.h @@ -22,6 +22,8 @@ #ifndef __HTYPE_H__ #define __HTYPE_H__ +#include + /** * ASCII codes to accept HTTP token (RFC 7230 3.2.6). */ diff --git a/fw/main.c b/fw/main.c index 9098b72de..6e6a4c194 100644 --- a/fw/main.c +++ b/fw/main.c @@ -353,7 +353,7 @@ tfw_ctlfn_state_io(struct ctl_table *ctl, int is_write, goto out; r = tfw_ctlfn_state_change(buf); - strlcpy(new_state_buf, + strscpy(new_state_buf, tfw_runstate_is_started() ? 
"start" : "stop", T_SYSCTL_STBUF_LEN); } else { @@ -403,7 +403,7 @@ tfw_ctlfn_errinj_io(struct ctl_table *ctl, int is_write, if ((r = tfw_cntl_errinj_change(buf))) goto out; - strlcpy(errinj_buf, buf, T_SYSCTL_ERRINJ_STBUF_LEN); + strscpy(errinj_buf, buf, T_SYSCTL_ERRINJ_STBUF_LEN); } else { struct errinj *inj; @@ -411,7 +411,7 @@ tfw_ctlfn_errinj_io(struct ctl_table *ctl, int is_write, if (inj) { errinj_to_str(inj, buf, sizeof(buf)); } else { - strlcpy(buf, "NONE", sizeof(buf)); + strscpy(buf, "NONE", sizeof(buf)); } r = proc_dostring(&tmp, is_write, user_buf, lenp, ppos); diff --git a/fw/procfs.c b/fw/procfs.c index 793001c68..2ffabc378 100644 --- a/fw/procfs.c +++ b/fw/procfs.c @@ -216,7 +216,7 @@ tfw_perfstat_seq_show(struct seq_file *seq, void *off) static int tfw_perfstat_seq_open(struct inode *inode, struct file *file) { - return single_open(file, tfw_perfstat_seq_show, PDE_DATA(inode)); + return single_open(file, tfw_perfstat_seq_show, pde_data(inode)); } static int @@ -304,8 +304,8 @@ static int tfw_srvstats_seq_open(struct inode *inode, struct file *file) { if (!tfw_runstate_is_reconfig()) - return single_open(file, tfw_srvstats_seq_show, PDE_DATA(inode)); - return single_open(file, tfw_srvstats_seq_reconfig, PDE_DATA(inode)); + return single_open(file, tfw_srvstats_seq_show, pde_data(inode)); + return single_open(file, tfw_srvstats_seq_reconfig, pde_data(inode)); } static int diff --git a/fw/sock.c b/fw/sock.c index b019083e5..46d57b235 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -366,7 +366,7 @@ ss_forced_mem_schedule(struct sock *sk, int size) if (size <= sk->sk_forward_alloc) return; amt = sk_mem_pages(size); - sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; + sk->sk_forward_alloc += amt * PAGE_SIZE; sk_memory_allocated_add(sk, amt); } @@ -423,7 +423,7 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags) skb_tfw_tls_type(skb)); ss_forced_mem_schedule(sk, skb->truesize); - skb_entail(sk, skb); + tcp_skb_entail(sk, skb); tp->write_seq += skb->len; TCP_SKB_CB(skb)->end_seq += skb->len; @@ -623,7 +623,7 @@ ss_do_close(struct sock *sk, int flags) if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); - percpu_counter_inc(sk->sk_prot->orphan_count); + this_cpu_inc(*sk->sk_prot->orphan_count); if (sk->sk_state == TCP_FIN_WAIT2) { const int tmo = tcp_fin_time(sk); @@ -1209,13 +1209,13 @@ ss_inet_create(struct net *net, int family, * socket-related functions assume that sk->sk_cgrp_data.val * is always non-zero. 
*/ - sk->sk_cgrp_data.val = (unsigned long) &cgrp_dfl_root.cgrp; + sk->sk_cgrp_data.cgroup = &cgrp_dfl_root.cgrp; } + inet_set_bit(IS_ICSK, sk); + inet_clear_bit(NODEFRAG, sk); inet = inet_sk(sk); - inet->is_icsk = 1; - inet->nodefrag = 0; - inet->inet_id = 0; + atomic_set(&inet->inet_id, 0); if (net->ipv4.sysctl_ip_no_pmtu_disc) inet->pmtudisc = IP_PMTUDISC_DONT; @@ -1238,21 +1238,20 @@ ss_inet_create(struct net *net, int family, (((u8 *)sk) + offset); np->hop_limit = -1; np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; - np->mc_loop = 1; + inet6_set_bit(MC_LOOP, sk); np->pmtudisc = IPV6_PMTUDISC_WANT; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; inet->pinet6 = np; } inet->uc_ttl = -1; - inet->mc_loop = 1; - inet->mc_ttl = 1; - inet->mc_all = 1; + inet_set_bit(MC_LOOP, sk); + inet_set_bit(TTL, sk); + inet_set_bit(MC_ALL, sk); inet->mc_index = 0; inet->mc_list = NULL; inet->rcv_tos = 0; - sk_refcnt_debug_inc(sk); if (sk->sk_prot->init && (err = sk->sk_prot->init(sk))) { T_ERR("cannot create socket, %d\n", err); sk_common_release(sk); @@ -1409,7 +1408,7 @@ ss_getpeername(struct sock *sk, TfwAddr *addr) if (inet6_sk(sk)) { struct ipv6_pinfo *np = inet6_sk(sk); addr->sin6_addr = sk->sk_v6_daddr; - addr->sin6_flowinfo = np->sndflow ? np->flow_label : 0; + addr->sin6_flowinfo = inet6_test_bit(SNDFLOW, sk) ? np->flow_label : 0; addr->in6_prefix = ipv6_iface_scope_id(&addr->sin6_addr, sk->sk_bound_dev_if); } else diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index 4e866c0c3..cd00c5ab7 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -811,7 +811,7 @@ tfw_listen_sock_start(TfwListenSock *ls) ss_set_listen(sk); - inet_sk(sk)->freebind = 1; + inet_set_bit(FREEBIND, sk); sk->sk_reuse = 1; r = ss_bind(sk, addr); if (r) { diff --git a/fw/ss_skb.c b/fw/ss_skb.c index ddfdc3612..0c0766eec 100644 --- a/fw/ss_skb.c +++ b/fw/ss_skb.c @@ -536,7 +536,7 @@ __split_pgfrag_del_w_frag(struct sk_buff *skb_head, struct sk_buff *skb, int i, /* Fast path: delete a full fragment. 
*/ if (unlikely(!off && len == skb_frag_size(frag))) { ss_skb_adjust_data_len(skb, -len); - __skb_frag_unref(frag); + __skb_frag_unref(frag, false); if (i + 1 < si->nr_frags) memmove(&si->frags[i], &si->frags[i + 1], (si->nr_frags - i - 1) * sizeof(skb_frag_t)); @@ -1326,9 +1326,7 @@ ss_skb_init_for_xmit(struct sk_buff *skb) skb->mac_len = 0; skb->queue_mapping = 0; skb->peeked = 0; - bzero_fast(&skb->headers_start, - offsetof(struct sk_buff, headers_end) - - offsetof(struct sk_buff, headers_start)); + bzero_fast(&skb->headers, sizeof(skb->headers)); skb->pfmemalloc = pfmemalloc; skb->mac_header = (typeof(skb->mac_header))~0U; skb->transport_header = (typeof(skb->transport_header))~0U; @@ -1552,7 +1550,7 @@ ss_skb_to_sgvec_with_new_pages(struct sk_buff *skb, struct scatterlist *sgl, int i; /* TODO: process of SKBTX_ZEROCOPY_FRAG for MSG_ZEROCOPY */ - if (skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG) { + if (skb_shinfo(skb)->tx_flags & SKBFL_SHARED_FRAG) { if (head_data_len) { sg_set_buf(sgl + out_frags, skb->data, head_data_len); out_frags++; @@ -1588,7 +1586,7 @@ ss_skb_to_sgvec_with_new_pages(struct sk_buff *skb, struct scatterlist *sgl, } if (out_frags > 0) sg_mark_end(&sgl[out_frags - 1]); - skb_shinfo(skb)->tx_flags &= ~SKBTX_SHARED_FRAG; + skb_shinfo(skb)->tx_flags &= ~SKBFL_SHARED_FRAG; } else { int r = skb_to_sgvec(skb, sgl + out_frags, 0, skb->len); if (r <= 0) diff --git a/fw/str_avx2.S b/fw/str_avx2.S index 239af7eaf..450f3003b 100644 --- a/fw/str_avx2.S +++ b/fw/str_avx2.S @@ -22,8 +22,7 @@ * Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include -#include -#include +#include #include #include #include diff --git a/fw/t/Makefile b/fw/t/Makefile index d3c66254d..af59f23ff 100644 --- a/fw/t/Makefile +++ b/fw/t/Makefile @@ -21,8 +21,8 @@ export TFW_CFLAGS EXTRA_CFLAGS += $(TFW_CFLAGS) -I$(src)/.. -I$(src)/../../ EXTRA_CFLAGS += $(TTLS_CFLAGS) -obj-m += unit/ +#obj-m += unit/ -obj-m += tfw_fuzzer.o +#obj-m += tfw_fuzzer.o tfw_fuzzer-objs = \ fuzzer.o diff --git a/ktest/crypto/sha512_base.h b/ktest/crypto/sha512_base.h new file mode 100644 index 000000000..949e1d1f4 --- /dev/null +++ b/ktest/crypto/sha512_base.h @@ -0,0 +1,43 @@ +/** + * Tempesta kernel emulation unit testing framework. + * + * Copyright (C) 2019 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +#ifndef __CRYPTO_SHA_H__ +#define __CRYPTO_SHA_H__ + +#include "linux/compiler.h" + +#define SHA512_DIGEST_SIZE 64 +#define SHA512_BLOCK_SIZE 128 + +#define SHA256_DIGEST_SIZE 32 +#define SHA256_BLOCK_SIZE 64 + +struct sha512_state { + u64 state[SHA512_DIGEST_SIZE / 8]; + u64 count[2]; + u8 buf[SHA512_BLOCK_SIZE]; +}; + +struct sha256_state { + u32 state[SHA256_DIGEST_SIZE / 4]; + u64 count[2]; + u8 buf[SHA256_BLOCK_SIZE]; +}; + +#endif /* __CRYPTO_SHA_H__ */ diff --git a/ktest/linux/kernel.h b/ktest/linux/kernel.h index 0e52cd915..7dafcb6a3 100644 --- a/ktest/linux/kernel.h +++ b/ktest/linux/kernel.h @@ -122,6 +122,12 @@ get_random_bytes_arch(void *buf, int nbytes) return nbytes; } +static inline int +get_random_bytes_wait(void *buf, int nbytes) +{ + return get_random_bytes_arch(buf, nbytes); +} + static inline void get_random_bytes(void *buf, int nbytes) { diff --git a/lib/str_simd.S b/lib/str_simd.S index 0ecb0e87c..b57f0635a 100644 --- a/lib/str_simd.S +++ b/lib/str_simd.S @@ -27,8 +27,6 @@ * Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include -#include -#include SYM_FUNC_START(__memcpy_fast) movq %rdx, %rax diff --git a/tls/bignum_x86-64.S b/tls/bignum_x86-64.S index 931bda51b..999a19e1e 100644 --- a/tls/bignum_x86-64.S +++ b/tls/bignum_x86-64.S @@ -18,7 +18,6 @@ * Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include -#include #include /* diff --git a/tls/crypto.h b/tls/crypto.h index 3a9dd7673..643c1d1fd 100644 --- a/tls/crypto.h +++ b/tls/crypto.h @@ -23,7 +23,7 @@ #define __TTLS_CRYPTO_H__ #include -#include +#include /* An enumeration of supported ciphers. */ typedef enum { diff --git a/tls/ecdh.c b/tls/ecdh.c index 7e8edb616..57463cce4 100644 --- a/tls/ecdh.c +++ b/tls/ecdh.c @@ -29,6 +29,7 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include "lib/str.h" +#include #include "ecdh.h" diff --git a/tls/tls_internal.h b/tls/tls_internal.h index 6d3a3f652..19bf5f695 100644 --- a/tls/tls_internal.h +++ b/tls/tls_internal.h @@ -402,7 +402,7 @@ unsigned long ttls_time_debug(void); #define ttls_time() ttls_time_debug() #else -#define ttls_time() get_seconds() +#define ttls_time() ktime_get_seconds() /* * CPUs since Intel Ice Lake are safe against SRBDS attack, so we're good * with the hardware random generator. @@ -410,7 +410,7 @@ unsigned long ttls_time_debug(void); static inline void ttls_rnd(void *buf, int len) { - int n = get_random_bytes_arch(buf, len); + int n = get_random_bytes_wait(buf, len); if (unlikely(n < len)) get_random_bytes((char *)buf + n, len - n); From 8e0e9423a89f04d033cddfa93149cb7adee84ed6 Mon Sep 17 00:00:00 2001 From: kingluo Date: Sat, 11 May 2024 18:34:17 +0800 Subject: [PATCH 02/25] apply stats fix in advanced --- fw/procfs.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fw/procfs.c b/fw/procfs.c index 2ffabc378..bbe34ef54 100644 --- a/fw/procfs.c +++ b/fw/procfs.c @@ -158,10 +158,6 @@ tfw_perfstat_seq_show(struct seq_file *seq, void *off) seq_printf(seq, "SS backlog's sizes\t\t\t: n/a\n"); } - /* Socket buffers kernel statistics. */ - seq_printf(seq, "Socket buffers in flight\t\t: %ld\n", - __get_skb_count()); - /* Cache statistics. 
*/ SPRN("Cache hits\t\t\t\t", cache.hits); SPRN("Cache misses\t\t\t\t", cache.misses); From dc6d26d2d9f4adeb50a027b05aa936a17e801633 Mon Sep 17 00:00:00 2001 From: kingluo Date: Sat, 11 May 2024 18:34:58 +0800 Subject: [PATCH 03/25] remove duplicated sk->sk_cgrp_data.cgroup init --- fw/sock.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fw/sock.c b/fw/sock.c index 46d57b235..ff89d61cf 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -1202,16 +1202,6 @@ ss_inet_create(struct net *net, int family, if (!(sk = sk_alloc(net, pfinet, GFP_ATOMIC, answer_prot, 1))) return -ENOBUFS; - if (in_interrupt()) { - /* - * When called from an interrupt context, sk_alloc() does not - * initialize sk->sk_cgrp_data, so we must do it here. Other - * socket-related functions assume that sk->sk_cgrp_data.val - * is always non-zero. - */ - sk->sk_cgrp_data.cgroup = &cgrp_dfl_root.cgrp; - } - inet_set_bit(IS_ICSK, sk); inet_clear_bit(NODEFRAG, sk); inet = inet_sk(sk); From bca16f5c9c5700b6848fb3bed8e8cc553ae41f6a Mon Sep 17 00:00:00 2001 From: kingluo Date: Sat, 11 May 2024 18:36:11 +0800 Subject: [PATCH 04/25] disable AVX2 temporarily before fpu works --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index b861a6b61..1f5178d81 100644 --- a/Makefile +++ b/Makefile @@ -112,8 +112,8 @@ ifeq (, $(findstring pse, $(PROC))) ERROR = "1MB huge pages support is required" endif ifneq (, $(findstring avx2, $(PROC))) - AVX2 = "y" - TFW_CFLAGS += -DAVX2=1 + #AVX2 = "y" + #TFW_CFLAGS += -DAVX2=1 endif ifneq (, $(findstring bmi2, $(PROC))) BMI2 = "y" From 1040a540a23e3cdf6d1750efedeafee168e74c52 Mon Sep 17 00:00:00 2001 From: kingluo Date: Tue, 14 May 2024 21:45:56 +0800 Subject: [PATCH 05/25] bignum_x86-64.S: replace ret with RET, to use __x86_return_thunk https://github.com/tempesta-tech/tempesta/issues/1808#issuecomment-2110165831 --- tls/bignum.c | 270 +++++++++++++++++++++++++++++++++++++++++++- tls/bignum.h | 10 +- tls/bignum_x86-64.S | 160 ++++---------------------- 3 files changed, 298 insertions(+), 142 deletions(-) diff --git a/tls/bignum.c b/tls/bignum.c index d9c25a3e2..73fb9fe22 100644 --- a/tls/bignum.c +++ b/tls/bignum.c @@ -276,6 +276,46 @@ ttls_mpi_size(const TlsMpi *X) * All the shifts for more than 64 bits are for integer number of limbs, * so the straightforward 2n algorithm is fine to make the each case simpler. */ +int +ttls_mpi_shift_ll(TlsMpi *X, size_t count) +{ + size_t v0, t1, old_used = X->used, i = ttls_mpi_bitlen(X); + unsigned long r0 = 0, r1, *p = MPI_P(X); + + if (unlikely(!i)) + return 0; + + v0 = count >> BSHIFT; + t1 = count & BMASK; + i += count; + + if (WARN_ON_ONCE((X->limbs << BSHIFT) < i)) + return -ENOSPC; + + X->used = BITS_TO_LIMBS(i); + if (old_used < X->used) + bzero_fast(p + old_used, (X->used - old_used) * CIL); + + /* Shift by count / limb_size. */ + if (v0 > 0) { + for (i = X->used; i > v0; i--) + p[i - 1] = p[i - v0 - 1]; + for ( ; i > 0; i--) + p[i - 1] = 0; + } + + /* shift by count % limb_size. */ + if (t1 > 0) { + for (i = v0; i < X->used; i++) { + r1 = p[i] >> (BIL - t1); + p[i] <<= t1; + p[i] |= r0; + r0 = r1; + } + } + + return 0; +} void ttls_mpi_shift_l(TlsMpi *X, const TlsMpi *A, size_t count) { @@ -331,6 +371,50 @@ ttls_mpi_shift_l(TlsMpi *X, const TlsMpi *A, size_t count) * All the shifts for more than 64 bits are for integer number of limbs, * so the straightforward 2n algorithm is fine to make the each case simpler. 
*/ +int +ttls_mpi_shift_r(TlsMpi *X, size_t count) +{ + size_t i, v0, v1; + unsigned long r0 = 0, r1; + + if (unlikely(!X->used || !MPI_P(X)[X->used - 1])) { + WARN_ON_ONCE(X->used > 1); + return 0; + } + + v0 = count >> BSHIFT; + v1 = count & BMASK; + + if (v0 > X->used || (v0 == X->used && v1 > 0)) { + ttls_mpi_lset(X, 0); + return 0; + } + + /* + * Shift by count / limb_size - remove least significant limbs. + * There could be garbage after last used limb, so be careful. + */ + if (v0 > 0) { + X->used -= v0; + for (i = 0; i < X->used; i++) + MPI_P(X)[i] = MPI_P(X)[i + v0]; + } + + /* Shift by count % limb_size. */ + if (v1 > 0) { + for (i = X->used; i > 0; i--) { + r1 = MPI_P(X)[i - 1] << (BIL - v1); + MPI_P(X)[i - 1] >>= v1; + MPI_P(X)[i - 1] |= r0; + r0 = r1; + } + if (!MPI_P(X)[X->used - 1]) + --X->used; + } + + return 0; +} +#if 0 void ttls_mpi_shift_r(TlsMpi *X, size_t count) { @@ -385,6 +469,7 @@ ttls_mpi_shift_r(TlsMpi *X, size_t count) X->s = 1; bzero_fast(x + X->used, (X->limbs - X->used) * CIL); } +#endif #if DBG_TLS /** @@ -588,6 +673,58 @@ ttls_mpi_cmp_int(const TlsMpi *X, long z) * * @A and @B must be different, but either of them can accept the result @X. */ +#if 1 +int +ttls_mpi_add_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) +{ + size_t i; + unsigned long *a, *b, *x, c = 0; + + BUG_ON(A == B); + if (X == B) { + const TlsMpi *T = A; + A = X; + B = T; + } + + /* X should always be positive as a result of unsigned additions. */ + X->s = 1; + + if (WARN_ON_ONCE(X->limbs < max_t(unsigned short, A->used, B->used))) + return -ENOSPC; + X->used = A->used; + + a = MPI_P(A); + b = MPI_P(B); + x = MPI_P(X); + /* TODO #1064 move out condition from under the loop. */ + for (i = 0; i < B->used; i++, a++, b++, x++) { + if (i == X->used) { + ++X->used; + *x = c; + } else { + *x = *a + c; + } + c = *x < c; + *x += *b; + c += *x < *b; + } + for ( ; c; i++, a++, x++) { + BUG_ON(i >= X->limbs); + if (i == X->used) { + ++X->used; + *x = c; + } else { + *x = *a + c; + } + c = *x < c; + } + if (X != A && X->used > i) + memcpy_fast(x, a, (X->used - i) * CIL); + + return 0; +} +#else void ttls_mpi_add_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) { @@ -611,11 +748,55 @@ ttls_mpi_add_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) /* X should always be positive as a result of unsigned additions. */ X->s = 1; } +#endif + +static void +__mpi_sub(unsigned long *a, size_t a_len, unsigned long *b, size_t b_len, + unsigned long *r) +{ + unsigned long c = 0, z, b_tmp, *b_end = b + b_len, *a_end = a + a_len; + + BUG_ON(a_len < b_len); + for ( ; b < b_end; a++, b++, r++) { + z = *a < c; + b_tmp = *b; + *r = *a - c; + c = (*r < b_tmp) + z; + *r -= b_tmp; + } + while (c) { + z = *a < c; + *r = *a - c; + c = z; + a++; + r++; + } + BUG_ON(a > a_end); + memcpy_fast(r, a, (a_end - a) * CIL); +} /** * Unsigned subtraction: X = |A| - |B|. * @X may reference either @A or @B. */ +#if 1 +int +ttls_mpi_sub_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) +{ + if (ttls_mpi_cmp_abs(A, B) < 0) + return -EINVAL; + + ttls_mpi_alloc(X, A->used); + + __mpi_sub(MPI_P(A), A->used, MPI_P(B), B->used, MPI_P(X)); + + /* X should always be positive as a result of unsigned subtractions. 
*/ + X->s = 1; + mpi_fixup_used(X, A->used); + + return 0; +} +#else void ttls_mpi_sub_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) { @@ -645,13 +826,15 @@ ttls_mpi_sub_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) *x = *a - *b; } else { - mpi_sub_x86_64(x, b, a, b_sz, a_sz); + //mpi_sub_x86_64(x, b, a, b_sz, a_sz); + __mpi_sub(MPI_P(A), A->used, MPI_P(B), B->used, MPI_P(X)); } /* X should always be positive as a result of unsigned subtractions. */ X->s = 1; mpi_fixup_used(X, a_sz); } +#endif /** * Signed addition: X = A + B @@ -685,6 +868,31 @@ ttls_mpi_add_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) /** * Signed subtraction: X = A - B */ +#if 1 +int +ttls_mpi_sub_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) +{ + int r, s = A->s; + + if (A->s * B->s > 0) { + if (ttls_mpi_cmp_abs(A, B) >= 0) { + if ((r = ttls_mpi_sub_abs(X, A, B))) + return r; + X->s = s; + } else { + if ((r = ttls_mpi_sub_abs(X, B, A))) + return r; + X->s = -s; + } + } else { + if ((r = ttls_mpi_add_abs(X, A, B))) + return r; + X->s = s; + } + + return 0; +} +#else void ttls_mpi_sub_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) { @@ -710,6 +918,7 @@ ttls_mpi_sub_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) X->s = s; } } +#endif /** * Signed addition: X = A + b @@ -726,6 +935,18 @@ ttls_mpi_add_int(TlsMpi *X, const TlsMpi *A, long b) /** * Signed subtraction: X = A - b */ +#if 1 +int +ttls_mpi_sub_int(TlsMpi *X, const TlsMpi *A, long b) +{ + DECLARE_MPI_AUTO(_B, 1); + MPI_P(&_B)[0] = (b < 0) ? -b : b; + _B.s = (b < 0) ? -1 : 1; + _B.limbs = _B.used = 1; + + return ttls_mpi_sub_mpi(X, A, &_B); +} +#else void ttls_mpi_sub_int(TlsMpi *X, const TlsMpi *A, long b) { @@ -745,6 +966,7 @@ ttls_mpi_sub_int(TlsMpi *X, const TlsMpi *A, long b) X->s = -1; } } +#endif #define MULADDC_INIT \ asm( "xorq %%r8, %%r8 \n\t" @@ -1121,6 +1343,49 @@ __mpi_montg_init(unsigned long *mm, const TlsMpi *N) * TODO #1335: this is used for modular exponentiation only, so repalce it with * an adequate assembly implementation for the RSA handshakes. */ +#if 1 +static int +__mpi_montmul(TlsMpi *A, const TlsMpi *B, const TlsMpi *N, unsigned long mm, + TlsMpi *T) +{ + size_t i, n, m; + unsigned long u0, u1, *d; + + BUG_ON(T->limbs < N->used + 1); + bzero_fast(MPI_P(T), T->limbs * CIL); + + d = MPI_P(T); + n = N->used; + m = (B->used < n) ? B->used : n; + + for (i = 0; i < n; i++) { + /* T = (T + u0*B + u1*N) / 2^BIL */ + u0 = MPI_P(A)[i]; + u1 = (d[0] + u0 * MPI_P(B)[0]) * mm; + + __mpi_mul(m, MPI_P(B), d, u0); + __mpi_mul(n, MPI_P(N), d, u1); + + *d++ = u0; + d[n + 1] = 0; + } + mpi_fixup_used(T, T->limbs); + + memcpy_fast(MPI_P(A), d, (n + 1) * CIL); + mpi_fixup_used(A, n + 1); + + if (ttls_mpi_cmp_abs(A, N) >= 0) { + __mpi_sub(MPI_P(A), A->used, MPI_P(N), N->used, MPI_P(A)); + mpi_fixup_used(A, A->used); + } else { + /* Prevent timing attacks. 
*/ + __mpi_sub(MPI_P(T), T->used, MPI_P(A), A->used, MPI_P(T)); + mpi_fixup_used(T, T->used); + } + + return 0; +} +#else static int __mpi_montmul(TlsMpi *A, const TlsMpi *B, const TlsMpi *N, unsigned long mm, TlsMpi *T) @@ -1162,6 +1427,7 @@ __mpi_montmul(TlsMpi *A, const TlsMpi *B, const TlsMpi *N, unsigned long mm, return 0; } +#endif /** * Montgomery reduction: A = A * R^-1 mod N @@ -1374,7 +1640,7 @@ ttls_mpi_gcd(TlsMpi *G, const TlsMpi *A, const TlsMpi *B) } if (lz) - ttls_mpi_shift_l(G, &TB, lz); + ttls_mpi_shift_ll(&TB, lz); else ttls_mpi_copy(G, &TB); } diff --git a/tls/bignum.h b/tls/bignum.h index 3cdb676a7..aa0c4e1a6 100644 --- a/tls/bignum.h +++ b/tls/bignum.h @@ -183,7 +183,7 @@ int ttls_mpi_safe_cond_swap(TlsMpi *X, TlsMpi *Y, unsigned char swap); void ttls_mpi_lset(TlsMpi *X, long z); void ttls_mpi_shift_l(TlsMpi *X, const TlsMpi *A, size_t count); -void ttls_mpi_shift_r(TlsMpi *X, size_t count); +int ttls_mpi_shift_r(TlsMpi *X, size_t count); int ttls_mpi_get_bit(const TlsMpi *X, size_t pos); void ttls_mpi_set_bit(TlsMpi *X, size_t pos, unsigned char val); size_t ttls_mpi_lsb(const TlsMpi *X); @@ -193,13 +193,13 @@ int ttls_mpi_cmp_abs(const TlsMpi *X, const TlsMpi *Y); int ttls_mpi_cmp_mpi(const TlsMpi *X, const TlsMpi *Y); int ttls_mpi_cmp_int(const TlsMpi *X, long z); -void ttls_mpi_add_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); +int ttls_mpi_add_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); void ttls_mpi_add_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); void ttls_mpi_add_int(TlsMpi *X, const TlsMpi *A, long b); -void ttls_mpi_sub_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); -void ttls_mpi_sub_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); -void ttls_mpi_sub_int(TlsMpi *X, const TlsMpi *A, long b); +int ttls_mpi_sub_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); +int ttls_mpi_sub_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); +int ttls_mpi_sub_int(TlsMpi *X, const TlsMpi *A, long b); void ttls_mpi_mul_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); void ttls_mpi_mul_uint(TlsMpi *X, const TlsMpi *A, unsigned long b); diff --git a/tls/bignum_x86-64.S b/tls/bignum_x86-64.S index 999a19e1e..5588f91d7 100644 --- a/tls/bignum_x86-64.S +++ b/tls/bignum_x86-64.S @@ -45,7 +45,7 @@ SYM_FUNC_START(mpi_cmp_x86_64_4) jnz .cmp_4_done .cmp_4_done: cmovbq %rdx, %rax - retq + RET SYM_FUNC_END(mpi_cmp_x86_64_4) @@ -95,10 +95,10 @@ SYM_FUNC_START(mpi_add_x86_64) incq %rax .add_done: - ret + RET .enospc: movq $-1, %rax - ret + RET SYM_FUNC_END(mpi_add_x86_64) @@ -134,120 +134,10 @@ SYM_FUNC_START(mpi_add_mod_p256_x86_64) sbbq %r11, %r9 movq %r8, 16(%rdi) movq %r9, 24(%rdi) - retq + RET SYM_FUNC_END(mpi_add_mod_p256_x86_64) -/** - * Subtract X = A - B, where A->used >= B->used. - * - * %RDI - pointer to X; - * %RSI - pointer to B; - * %RDX - pointer to A; - * %RCX - B->used (used directly for looping); - * %R8 - A->used. - * - * TODO #1335 it seems we can throw out the generic-length functions. - */ -SYM_FUNC_START(mpi_sub_x86_64) - subq %rcx, %r8 - addq $1, %r8 - - /* Get code address by size of tail. 
*/ -.section .rodata -.align 8 -.sub_tail_jmp_tbl: - .quad .sub_tail0 - .quad .sub_tail1 - .quad .sub_tail2 - .quad .sub_tail3 -.text - pushq %rbx - movq %rcx, %rbx - andq $3, %rbx - movq .sub_tail_jmp_tbl(, %rbx, 8), %rbx - - xorq %rax, %rax - shrq $2, %rcx - jz .sub_small_b - pushq %r12 - clc -.sub_by_4: - movq (%rdx, %rax, 8), %r9 - movq 8(%rdx, %rax, 8), %r10 - movq 16(%rdx, %rax, 8), %r11 - movq 24(%rdx, %rax, 8), %r12 - sbbq (%rsi, %rax, 8), %r9 - sbbq 8(%rsi, %rax, 8), %r10 - sbbq 16(%rsi, %rax, 8), %r11 - sbbq 24(%rsi, %rax, 8), %r12 - movq %r9, (%rdi, %rax, 8) - incq %rax - movq %r10, (%rdi, %rax, 8) - incq %rax - movq %r11, (%rdi, %rax, 8) - incq %rax - movq %r12, (%rdi, %rax, 8) - incq %rax - loop .sub_by_4 - popq %r12 - ANNOTATE_RETPOLINE_SAFE - jmpq *%rbx -.sub_small_b: - clc - ANNOTATE_RETPOLINE_SAFE - jmpq *%rbx - -.sub_tail3: - movq (%rdx, %rax, 8), %r9 - sbbq (%rsi, %rax, 8), %r9 - movq %r9, (%rdi, %rax, 8) - incq %rax -.sub_tail2: - movq (%rdx, %rax, 8), %r10 - sbbq (%rsi, %rax, 8), %r10 - movq %r10, (%rdi, %rax, 8) - incq %rax -.sub_tail1: - movq (%rdx, %rax, 8), %r11 - sbbq (%rsi, %rax, 8), %r11 - movq %r11, (%rdi, %rax, 8) - incq %rax -.sub_tail0: - popq %rbx - - /* - * Borrow required digets from the more significant limbs in @A. - * There is either CF = 0 or we have more limbs in @A. - */ - movq %r8, %rcx - jnc .copy_msb - jmp .propagate_borrow -.propagate_borrow_loop: - movq (%rdx, %rax, 8), %r10 - sbbq $0, %r10 - movq %r10, (%rdi, %rax, 8) - incq %rax - jnc .need_copy -.propagate_borrow: - loop .propagate_borrow_loop - ud2 - - /* Copy the rest of A to X if no need to borrow. */ -.copy_msb_loop: - movq (%rdx, %rax, 8), %r10 - movq %r10, (%rdi, %rax, 8) - incq %rax -.copy_msb: - loop .copy_msb_loop - ret - -.need_copy: - cmpq %rdx, %rdi - jne .copy_msb - ret -SYM_FUNC_END(mpi_sub_x86_64) - /* * Operands size specialized implementations of the function above. 
* @@ -272,7 +162,7 @@ SYM_FUNC_START(mpi_sub_x86_64_5_4) movq 32(%rdx), %r8 sbbq $0, %r8 movq %r8, 32(%rdi) - ret + RET SYM_FUNC_END(mpi_sub_x86_64_5_4) SYM_FUNC_START(mpi_sub_x86_64_4_4) @@ -288,7 +178,7 @@ SYM_FUNC_START(mpi_sub_x86_64_4_4) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) - ret + RET SYM_FUNC_END(mpi_sub_x86_64_4_4) /** @@ -321,7 +211,7 @@ SYM_FUNC_START(mpi_sub_mod_p256_x86_64) adcq %r11, %r9 movq %r8, 16(%rdi) movq %r9, 24(%rdi) - retq + RET SYM_FUNC_END(mpi_sub_mod_p256_x86_64) SYM_FUNC_START(mpi_sub_x86_64_3_3) @@ -334,7 +224,7 @@ SYM_FUNC_START(mpi_sub_x86_64_3_3) movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) - ret + RET SYM_FUNC_END(mpi_sub_x86_64_3_3) SYM_FUNC_START(mpi_sub_x86_64_2_2) @@ -344,7 +234,7 @@ SYM_FUNC_START(mpi_sub_x86_64_2_2) sbbq 8(%rsi), %r9 movq %r8, (%rdi) movq %r9, 8(%rdi) - ret + RET SYM_FUNC_END(mpi_sub_x86_64_2_2) @@ -383,7 +273,7 @@ SYM_FUNC_START(mpi_shift_l_x86_64) movq (%rsi), %r11 shlq %cl, %r11 movq %r11, (%rdi) - ret + RET SYM_FUNC_END(mpi_shift_l_x86_64) /** @@ -410,7 +300,7 @@ SYM_FUNC_START(mpi_shift_l_x86_64_4) movq %r10, 16(%rdi) movq %r9, 8(%rdi) movq %r8, (%rdi) - ret + RET SYM_FUNC_END(mpi_shift_l_x86_64_4) /** @@ -440,7 +330,7 @@ SYM_FUNC_START(mpi_shift_l1_mod_p256_x86_64) sbbq %r10, %r8 movq %rcx, 16(%rdi) movq %r8, 24(%rdi) - retq + RET SYM_FUNC_END(mpi_shift_l1_mod_p256_x86_64) @@ -469,7 +359,7 @@ SYM_FUNC_START(mpi_shift_r_x86_64) .shr_last: shrq %cl, (%rdi, %rax, 8) - ret + RET SYM_FUNC_END(mpi_shift_r_x86_64) /** @@ -487,7 +377,7 @@ SYM_FUNC_START(mpi_shift_r_x86_64_4) shrdq %cl, %r9, 8(%rdi) shrdq %cl, %r10, 16(%rdi) shrq %cl, 24(%rdi) - ret + RET SYM_FUNC_END(mpi_shift_r_x86_64_4) @@ -521,7 +411,7 @@ SYM_FUNC_START(mpi_div2_x86_64_4) movq %rax, 8(%rdi) movq %rcx, 16(%rdi) movq %r8, 24(%rdi) - retq + RET SYM_FUNC_END(mpi_div2_x86_64_4) @@ -566,7 +456,7 @@ SYM_FUNC_START_32(mpi_tpl_mod_p256_x86_64) sbbq %r10, %r8 movq %rcx, 16(%rdi) movq %r8, 24(%rdi) - retq + RET SYM_FUNC_END(mpi_tpl_mod_p256_x86_64) @@ -683,7 +573,7 @@ SYM_FUNC_START_32(mpi_mul_x86_64_4) pop %r14 pop %r13 pop %r12 - ret + RET SYM_FUNC_END(mpi_mul_x86_64_4) @@ -777,7 +667,7 @@ SYM_FUNC_START_32(mpi_sqr_x86_64_4) pop %r13 pop %r12 pop %rbx - ret + RET SYM_FUNC_END(mpi_sqr_x86_64_4) @@ -1053,7 +943,7 @@ SYM_FUNC_START_32(ecp_mod_p256_x86_64) popq %r13 popq %r12 popq %rbx - ret + RET SYM_FUNC_END(ecp_mod_p256_x86_64) /** @@ -1079,7 +969,7 @@ SYM_FUNC_START_32(mpi_mul_int_x86_64_4) popq %r13 popq %r12 - retq + RET SYM_FUNC_END(mpi_mul_int_x86_64_4) /* @@ -1307,7 +1197,7 @@ SYM_FUNC_START_32(mpi_mul_mod_p256_x86_64_4) pop %r14 pop %r13 pop %r12 - ret + RET SYM_FUNC_END(mpi_mul_mod_p256_x86_64_4) SYM_FUNC_START_32(mpi_sqr_mod_p256_x86_64_4) @@ -1510,7 +1400,7 @@ SYM_FUNC_START_32(mpi_sqr_mod_p256_x86_64_4) pop %r13 pop %r12 pop %rbx - ret + RET SYM_FUNC_END(mpi_sqr_mod_p256_x86_64_4) @@ -1646,7 +1536,7 @@ SYM_FUNC_START_32(mpi_from_mont_p256_x86_64) popq %r14 popq %r13 popq %r12 - retq + RET SYM_FUNC_END(mpi_from_mont_p256_x86_64) /** @@ -1749,7 +1639,7 @@ SYM_FUNC_START_32(mpi_mul_mont_mod_p256_x86_64) popq %r12 popq %rbp popq %rbx - retq + RET SYM_FUNC_END(mpi_mul_mont_mod_p256_x86_64) /** @@ -1832,5 +1722,5 @@ SYM_FUNC_START_32(mpi_sqr_mont_mod_p256_x86_64) popq %r14 popq %r13 popq %r12 - retq + RET SYM_FUNC_END(mpi_sqr_mont_mod_p256_x86_64) From 95d8481839eb13144ab313fa650a7bc99dfcd563 Mon Sep 17 00:00:00 2001 From: kingluo Date: Fri, 17 May 2024 09:56:15 +0800 Subject: [PATCH 06/25] re-enable AVX2, replace ret with RET --- Makefile | 4 
++-- fw/str.h | 2 ++ fw/str_avx2.S | 60 +++++++++++++++++++++++++------------------------- lib/str_simd.S | 14 ++++++------ 4 files changed, 41 insertions(+), 39 deletions(-) diff --git a/Makefile b/Makefile index 1f5178d81..b861a6b61 100644 --- a/Makefile +++ b/Makefile @@ -112,8 +112,8 @@ ifeq (, $(findstring pse, $(PROC))) ERROR = "1MB huge pages support is required" endif ifneq (, $(findstring avx2, $(PROC))) - #AVX2 = "y" - #TFW_CFLAGS += -DAVX2=1 + AVX2 = "y" + TFW_CFLAGS += -DAVX2=1 endif ifneq (, $(findstring bmi2, $(PROC))) BMI2 = "y" diff --git a/fw/str.h b/fw/str.h index 136860fd2..d277f12d7 100644 --- a/fw/str.h +++ b/fw/str.h @@ -120,6 +120,7 @@ tfw_cstrtolower_wo_avx2(void *dest, const void *src, size_t len) d[i] = tolower(s[i]); } +#undef AVX2 #ifdef AVX2 /* * The functions expect non-ovelapping strings, so use restrict notation in @@ -181,6 +182,7 @@ tfw_cstricmp_2lc(const char *s1, const char *s2, size_t len) return strncasecmp(s1, s2, len); } #endif +#define AVX2 1 /* Buffer size to hold all possible values of unsigned long */ #define TFW_ULTOA_BUF_SIZ 20 diff --git a/fw/str_avx2.S b/fw/str_avx2.S index 450f3003b..9a93d10d3 100644 --- a/fw/str_avx2.S +++ b/fw/str_avx2.S @@ -349,7 +349,7 @@ SYM_FUNC_START(__tfw_strtolower_avx2) popq %r12 popq %rbp leaq -8(%r10), %rsp - ret + RET /* * Place the processing code before the conditions block to avoid 2 @@ -550,10 +550,10 @@ SYM_FUNC_START(__tfw_stricmp_avx2) xorb __tfw_lct(%rcx), %al movzbl %al, %eax orl %edx, %eax - ret + RET .stricmp_len0: xorl %eax, %eax - ret + RET .stricmp_len1: xorl %edx, %edx jmp .stricmp_do_len1 @@ -640,7 +640,7 @@ SYM_FUNC_START(__tfw_stricmp_avx2) vpmovmskb %xmm0, %eax .stricmp_short_nomatch: xorl $0xffff, %eax - ret + RET .stricmp_try128: vpxor %xmm2, %xmm2, %xmm2 @@ -692,11 +692,11 @@ SYM_FUNC_START(__tfw_stricmp_avx2) vpcmpeqb %xmm1, %xmm0, %xmm0 vpmovmskb %xmm0, %eax xorl $0xffff, %eax - ret + RET .stricmp_match: xorl %eax, %eax - ret + RET .stricmp_128: leaq (%rsi,%rax), %r8 @@ -762,7 +762,7 @@ SYM_FUNC_START(__tfw_stricmp_avx2) .stricmp_nomatch: movl $1, %eax - ret + RET .stricmp_64: leaq (%rsi,%rax), %r8 @@ -898,10 +898,10 @@ SYM_FUNC_START(__tfw_stricmp_avx2_2lc) xorb (%rsi), %al movzbl %al, %eax orl %edx, %eax - ret + RET .sic2lc_len0: xorl %eax, %eax - ret + RET .sic2lc_len1: xorl %edx, %edx jmp .sic2lc_do_len1 @@ -966,7 +966,7 @@ SYM_FUNC_START(__tfw_stricmp_avx2_2lc) vpmovmskb %xmm0, %eax .sic2lc_short_nomatch: xorl $0xffff, %eax - ret + RET .sic2lc_try128: xorl %eax, %eax @@ -1012,7 +1012,7 @@ SYM_FUNC_START(__tfw_stricmp_avx2_2lc) vpmovmskb %xmm0, %eax .sic2lc_tail_maybe_match: xorl $0xffff, %eax - ret + RET .sic2lc_tail_big: addq %r8, %rsi cmpq $15, %rcx @@ -1044,7 +1044,7 @@ SYM_FUNC_START(__tfw_stricmp_avx2_2lc) .sic2lc_match: xorl %eax, %eax - ret + RET .sic2lc_128: leaq (%rsi,%rax), %r8 @@ -1094,7 +1094,7 @@ SYM_FUNC_START(__tfw_stricmp_avx2_2lc) .sic2lc_nomatch: movl $1, %eax - ret + RET .sic2lc_64: leaq (%rsi,%rax), %r8 @@ -1184,16 +1184,16 @@ SYM_FUNC_START(__tfw_match_custom) movl $2, %ecx cmove %rcx, %rax .mcust_small_len_done: - ret + RET .mcust_len_1_or_0: cmpq $1, %rsi jne .mcust_len_0 movzbl (%rdi), %eax movzbl (%rdx,%rax), %eax - ret + RET .mcust_len_0: xorl %eax, %eax - ret + RET .mcust_len_3: xorl %r8d, %r8d jmp .mcust_cmp_long_len @@ -1430,7 +1430,7 @@ SYM_FUNC_START(__tfw_match_custom) popq %r10 popq %rbp leaq -8(%r10), %rsp - ret + RET .mcust_got_mismatch_tail: subq %rdi, %rax testb %r11b, %r11b @@ -1507,16 +1507,16 @@ SYM_CODE_START(__tfw_strspn_simd) movl $2, %ecx 
cmove %rcx, %rax .strspn_small_len_done: - ret + RET .strspn_len_1or0: cmpq $1, %rsi jne .strspn_len_0 movzbl (%rdi), %eax movzbl (%rdx,%rax), %eax - ret + RET .strspn_len_0: xorl %eax, %eax - ret + RET .strspn_len_3: xorl %r8d, %r8d jmp .strspn_do_len_3 @@ -1723,7 +1723,7 @@ SYM_CODE_START(__tfw_strspn_simd) popq %r10 popq %rbp leaq -8(%r10), %rsp - ret + RET .strspn_tail_loop_nomatch: subq %rdi, %rax testb %r11b, %r11b @@ -1801,18 +1801,18 @@ SYM_CODE_END(__tfw_strspn_simd) /* for local sibling calls only */ testl %edx, %edx movl $2, %edx cmove %rdx, %rax - ret + RET .ctxt_\NAME\()_len_10: cmpq $1, %rsi jne .ctxt_\NAME\()_len_0 movzbl (%rdi), %eax movzbl \NAME(%rax), %eax - ret + RET .ctxt_\NAME\()_len_0: xorl %eax, %eax - ret + RET .ctxt_\NAME\()_match1: - rep ret + RET .ctxt_\NAME\()_len_3: xorl %esi, %esi jmp .ctxt_\NAME\()_do_len_3 @@ -1997,7 +1997,7 @@ SYM_CODE_END(__tfw_strspn_simd) /* for local sibling calls only */ .ctxt_\NAME\()_mistmatch: subq %rdi, %rax addq %rcx, %rax - ret + RET /* The tail loops processes string lengths in range [4, 15]. */ .ctxt_\NAME\()_tail_loop: @@ -2032,7 +2032,7 @@ SYM_CODE_END(__tfw_strspn_simd) /* for local sibling calls only */ xorl %eax, %eax .ctxt_\NAME\()_add_len: addq %rsi, %rax - ret + RET .ctxt_\NAME\()_end: subq %rdi, %rax testb %r10b, %r10b @@ -2042,10 +2042,10 @@ SYM_CODE_END(__tfw_strspn_simd) /* for local sibling calls only */ cmovne %r9d, %r8d movl %r8d, %r8d leaq 2(%rax,%r8), %rax - ret + RET .ctxt_\NAME\()_mismatch_1: addq %rdx, %rax - ret + RET .ctxt_\NAME\()_tail_1: movzbl (%rax), %edx subq %rdi, %rax @@ -2065,7 +2065,7 @@ SYM_CODE_END(__tfw_strspn_simd) /* for local sibling calls only */ testb %cl, \NAME(%rax) je .ctxt_\NAME\()_add_tail_len leaq 2(%rsi,%rdx), %rax - ret + RET .ctxt_\NAME\()_tail_3: movzbl 2(%rax), %edx movzbl \NAME(%rdx), %edx diff --git a/lib/str_simd.S b/lib/str_simd.S index b57f0635a..6b179efab 100644 --- a/lib/str_simd.S +++ b/lib/str_simd.S @@ -59,7 +59,7 @@ SYM_FUNC_START(__memcpy_fast) andl $1, %edx jne .L1cpy /* Don't clean the registers w/ vzeroupper. */ - ret + RET .p2align 4 .Lrepeat128cpy: movq %r8, %rsi @@ -132,7 +132,7 @@ SYM_FUNC_START(__memcpy_fast) movzbl (%rcx), %edx movb %dl, (%rax) .Lcpy_ret: - ret + RET SYM_FUNC_END(__memcpy_fast) SYM_FUNC_START(__memcmp_fast) @@ -163,7 +163,7 @@ SYM_FUNC_START(__memcmp_fast) xorl %eax, %eax andl $1, %edx jne .L1cmp - ret + RET .p2align 4 .L128cmp: vlddqu -96(%rcx), %ymm0 @@ -198,7 +198,7 @@ SYM_FUNC_START(__memcmp_fast) je .L128cmp .Lret_neq: movl $1, %eax - ret + RET .L64cmp: vlddqu (%rcx), %ymm0 vlddqu (%rsi), %ymm1 @@ -264,7 +264,7 @@ SYM_FUNC_START(__memcmp_fast) setne %al movzbl %al, %eax .Lcmp_ret: - ret + RET SYM_FUNC_END(__memcmp_fast) SYM_FUNC_START(__bzero_fast) @@ -297,7 +297,7 @@ SYM_FUNC_START(__bzero_fast) andl $1, %esi jne .L1zer /* Don't clean the registers w/ vzeroupper. 
*/ - ret + RET .p2align 4 .L128zer: vmovdqu %ymm0, (%rdx) @@ -349,5 +349,5 @@ SYM_FUNC_START(__bzero_fast) .L1zer: movb $0, (%rax) .Lbz_ret: - ret + RET SYM_FUNC_END(__bzero_fast) From 86e304311107ec58a052323f5210e881dabf73c6 Mon Sep 17 00:00:00 2001 From: kingluo Date: Fri, 24 May 2024 00:07:45 +0800 Subject: [PATCH 07/25] disable AVX2 for memcpy/memcmp/bzero temporarily to avoid userspace segfault --- lib/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/main.c b/lib/main.c index 59a44567d..b02baa052 100644 --- a/lib/main.c +++ b/lib/main.c @@ -19,6 +19,7 @@ */ #include #include +#undef AVX2 MODULE_AUTHOR("Tempesta Technologies, INC"); MODULE_VERSION("0.1.1"); From e6256ea6c88dc0905565d3852b22977c32ee7490 Mon Sep 17 00:00:00 2001 From: kingluo Date: Fri, 24 May 2024 00:08:25 +0800 Subject: [PATCH 08/25] disable skb->head_frag assertion temporarily to avoid panic --- fw/ss_skb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fw/ss_skb.c b/fw/ss_skb.c index 0c0766eec..2aca7ca87 100644 --- a/fw/ss_skb.c +++ b/fw/ss_skb.c @@ -331,7 +331,7 @@ __split_linear_data(struct sk_buff *skb_head, struct sk_buff *skb, char *pspt, T_DBG3("[%d]: %s: skb [%p] pspt [%p] len [%d] tail_len [%d]\n", smp_processor_id(), __func__, skb, pspt, len, tail_len); - BUG_ON(!skb->head_frag); + //BUG_ON(!skb->head_frag); BUG_ON(tail_len <= 0); BUG_ON(!(alloc | tail_len)); BUG_ON(-len > tail_len); @@ -1374,7 +1374,7 @@ ss_skb_queue_coalesce_tail(struct sk_buff **skb_head, const struct sk_buff *skb) unsigned int headlen = skb_headlen(skb); if (headlen) { - BUG_ON(!skb->head_frag); + //BUG_ON(!skb->head_frag); head_frag.bv_len = headlen; head_frag.bv_page = virt_to_page(skb->head); head_frag.bv_offset = skb->data - From 7c10e6ae3e7c750b0186df6032caab9115e4f6d9 Mon Sep 17 00:00:00 2001 From: kingluo Date: Sun, 26 May 2024 02:08:16 +0800 Subject: [PATCH 09/25] fix TfwGState->curr type: ensure correct frang index --- fw/gfsm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fw/gfsm.h b/fw/gfsm.h index 567759ae6..4dc371c69 100644 --- a/fw/gfsm.h +++ b/fw/gfsm.h @@ -164,7 +164,7 @@ typedef struct { * @states - all FSM states, i.e. the FSM states set; */ typedef struct { - char curr; + int curr; void *obj; unsigned short states[TFW_GFSM_FSM_NUM]; } TfwGState; From 3d5730e078fea9b595e5444420abbe7ced7dfede Mon Sep 17 00:00:00 2001 From: kingluo Date: Mon, 27 May 2024 16:55:26 +0800 Subject: [PATCH 10/25] filter out SO_EE_ORIGIN_TIMESTAMPING in sk->sk_error_queue --- fw/sock.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fw/sock.c b/fw/sock.c index ff89d61cf..3ecb08c04 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -25,6 +25,7 @@ #include #include #include +#include #undef DEBUG #if DBG_SS > 0 @@ -940,11 +941,16 @@ ss_tcp_data_ready(struct sock *sk) TFW_VALIDATE_SK_LOCK_OWNER(sk); if (!skb_queue_empty(&sk->sk_error_queue)) { + struct sk_buff* skb = sk->sk_error_queue.next; /* * Error packet received. * See sock_queue_err_skb() in linux/net/core/skbuff.c. 
*/ - T_ERR("error data in socket %p\n", sk); + if (SKB_EXT_ERR(skb)->ee.ee_errno != ENOMSG && + SKB_EXT_ERR(skb)->ee.ee_origin != + SO_EE_ORIGIN_TIMESTAMPING) { + T_ERR("error data in socket %p\n", sk); + } return; } From 89d2f30405fe151df61b2cc4d3cc58d4fef56e25 Mon Sep 17 00:00:00 2001 From: kingluo Date: Tue, 4 Jun 2024 17:18:27 +0800 Subject: [PATCH 11/25] enable fpu in the whole softirq ctx --- fw/apm.c | 4 ++++ fw/cache.c | 2 ++ fw/http_sched_ratio.c | 4 ++++ fw/sock.c | 6 ++++++ fw/sock_clnt.c | 2 ++ fw/sock_srv.c | 4 ++++ fw/str.h | 4 +--- lib/main.c | 1 - tls/tls_ticket.c | 2 ++ 9 files changed, 25 insertions(+), 4 deletions(-) diff --git a/fw/apm.c b/fw/apm.c index 0a1b54d49..2a8caf3b7 100644 --- a/fw/apm.c +++ b/fw/apm.c @@ -1006,6 +1006,8 @@ tfw_apm_prcntl_tmfn(struct timer_list *t) TfwApmRBuf *rbuf = &data->rbuf; TfwApmRBEnt *rbent = rbuf->rbent; + kernel_fpu_begin(); + /* * Increment the counter and make the updates use the other array * of the two that are available. In the meanwhile, use the array @@ -1045,6 +1047,8 @@ tfw_apm_hm_timer_cb(struct timer_list *t) TfwApmHM *hm = READ_ONCE(hmctl->hm); unsigned long now; + kernel_fpu_begin(); + BUG_ON(!hm); if (!atomic64_read(&hmctl->rcount)) tfw_http_hm_srv_send(srv, hm->req, hm->reqsz); diff --git a/fw/cache.c b/fw/cache.c index de1a652ff..f123bdbd2 100644 --- a/fw/cache.c +++ b/fw/cache.c @@ -3099,6 +3099,8 @@ tfw_wq_tasklet(unsigned long data) TfwRBQueue *wq = &ct->wq; TfwCWork cw; + kernel_fpu_begin(); + while (!tfw_wq_pop(wq, &cw)) tfw_cache_do_action(cw.msg, cw.action); diff --git a/fw/http_sched_ratio.c b/fw/http_sched_ratio.c index 11495921b..06553dbc7 100644 --- a/fw/http_sched_ratio.c +++ b/fw/http_sched_ratio.c @@ -680,6 +680,8 @@ tfw_sched_ratio_dynamic_tmfn(struct timer_list *t) { TfwRatio *r = from_timer(r, t, timer); + kernel_fpu_begin(); + tfw_sched_ratio_calc_tmfn(r, tfw_sched_ratio_calc_dynamic); } @@ -691,6 +693,8 @@ tfw_sched_ratio_predict_tmfn(struct timer_list *t) { TfwRatio *r = from_timer(r, t, timer); + kernel_fpu_begin(); + tfw_sched_ratio_calc_tmfn(r, tfw_sched_ratio_calc_predict); } diff --git a/fw/sock.c b/fw/sock.c index 3ecb08c04..8d361f7be 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -196,6 +196,7 @@ ss_active_guard_exit(unsigned long val) static void ss_conn_drop_guard_exit(struct sock *sk) { + kernel_fpu_begin(); SS_CONN_TYPE(sk) &= ~(Conn_Closing | Conn_Shutdown); SS_CALL(connection_drop, sk); if (sk->sk_security) @@ -935,6 +936,8 @@ ss_tcp_data_ready(struct sock *sk) int (*action)(struct sock *sk, int flags); bool was_stopped = (SS_CONN_TYPE(sk) & Conn_Stop); + kernel_fpu_begin(); + T_DBG3("[%d]: %s: sk=%p state=%s\n", smp_processor_id(), __func__, sk, ss_statename[sk->sk_state]); assert_spin_locked(&sk->sk_lock.slock); @@ -1021,6 +1024,7 @@ ss_tcp_data_ready(struct sock *sk) static void ss_tcp_state_change(struct sock *sk) { + kernel_fpu_begin(); T_DBG3("[%d]: %s: sk=%p state=%s\n", smp_processor_id(), __func__, sk, ss_statename[sk->sk_state]); ss_sk_incoming_cpu_update(sk); @@ -1454,6 +1458,8 @@ ss_tx_action(void) TfwRBQueue *wq = this_cpu_ptr(&si_wq); long ticket = 0; + kernel_fpu_begin(); + /* * @budget limits the loop to prevent live lock on constantly arriving * new items. 
We use some small integer as a lower bound to catch just diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index cd00c5ab7..b5ab518e8 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -70,6 +70,8 @@ tfw_sock_cli_keepalive_timer_cb(struct timer_list *t) { TfwCliConn *cli_conn = from_timer(cli_conn, t, timer); + kernel_fpu_begin(); + T_DBG("Client timeout end\n"); /* diff --git a/fw/sock_srv.c b/fw/sock_srv.c index f6c3a9df2..85a44e8ac 100644 --- a/fw/sock_srv.c +++ b/fw/sock_srv.c @@ -300,6 +300,8 @@ tfw_sock_srv_connect_retry_timer_cb(struct timer_list *t) { TfwSrvConn *srv_conn = from_timer(srv_conn, t, timer); + kernel_fpu_begin(); + /* A new socket is created for each connect attempt. */ tfw_sock_srv_connect_try(srv_conn); } @@ -816,6 +818,8 @@ tfw_sock_srv_grace_shutdown_cb(struct timer_list *t) { TfwServer *srv = from_timer(srv, t, gs_timer); + kernel_fpu_begin(); + tfw_sock_srv_grace_stop(srv); } diff --git a/fw/str.h b/fw/str.h index d277f12d7..2a53b8176 100644 --- a/fw/str.h +++ b/fw/str.h @@ -120,8 +120,7 @@ tfw_cstrtolower_wo_avx2(void *dest, const void *src, size_t len) d[i] = tolower(s[i]); } -#undef AVX2 -#ifdef AVX2 +#if 0 /* * The functions expect non-ovelapping strings, so use restrict notation in * the declarations just as a specification. @@ -182,7 +181,6 @@ tfw_cstricmp_2lc(const char *s1, const char *s2, size_t len) return strncasecmp(s1, s2, len); } #endif -#define AVX2 1 /* Buffer size to hold all possible values of unsigned long */ #define TFW_ULTOA_BUF_SIZ 20 diff --git a/lib/main.c b/lib/main.c index b02baa052..59a44567d 100644 --- a/lib/main.c +++ b/lib/main.c @@ -19,7 +19,6 @@ */ #include #include -#undef AVX2 MODULE_AUTHOR("Tempesta Technologies, INC"); MODULE_VERSION("0.1.1"); diff --git a/tls/tls_ticket.c b/tls/tls_ticket.c index bd4236878..07c55c82b 100644 --- a/tls/tls_ticket.c +++ b/tls/tls_ticket.c @@ -179,6 +179,8 @@ ttls_ticket_rotate_keys(struct timer_list *t) TlsTicketPeerCfg *tcfg = from_timer(tcfg, t, timer); unsigned long secs; + kernel_fpu_begin(); + T_DBG("TLS: Rotate keys for ticket configuration [%pK]\n", tcfg); if (ttls_ticket_update_keys(tcfg)) T_ERR("TLS: Can't rotate keys for ticket configuration [%pK]\n", From 2e4a18dbad4729ab79f4f47a99f281b2a73ed916 Mon Sep 17 00:00:00 2001 From: kingluo Date: Tue, 4 Jun 2024 22:36:30 +0800 Subject: [PATCH 12/25] continue sk->sk_receive_queue processing in case of SO_EE_ORIGIN_TIMESTAMPING --- fw/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fw/sock.c b/fw/sock.c index 8d361f7be..259504bcd 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -953,8 +953,8 @@ ss_tcp_data_ready(struct sock *sk) SKB_EXT_ERR(skb)->ee.ee_origin != SO_EE_ORIGIN_TIMESTAMPING) { T_ERR("error data in socket %p\n", sk); + return; } - return; } if (skb_queue_empty(&sk->sk_receive_queue)) { From f8b1e532724c490ce9e868ea1e2e56fdf4d300e2 Mon Sep 17 00:00:00 2001 From: kingluo Date: Thu, 6 Jun 2024 21:36:42 +0800 Subject: [PATCH 13/25] fix assembly functions: RET and endbr64-jump-table Problems: 1. In the new kernel, assembly functions uniformly return from `__x86_return_thunk`. However, our assembly code uses the original `ret` instruction, so objtool in the kernel will notice this is a naked return during compilation. 2. `SYM_FUNC_START` in the new kernel will add endbr64 to the head of the assembly function, and all indirect jumps to ENDBR instructions, that is, the code snippet within the same function, will fail, but we use jump tables in the assembly function to perform indirect jumps. 
It will raise CET exception: https://en.wikipedia.org/wiki/X86_instruction_listings#Added_with_Intel_CET). Solutions: 1. Substitute the `ret` with `RET`, a macro in the new kernel to ensure the correct return. 2. `notrack jmp` and enable notrack in CPU setting: `wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN | CET_NO_TRACK_EN)` As an aside, interestingly, if a user-mode C program uses a switch statement that meets the conditions for generating a jump table (gcc uses `-fcf-protection=full` by default), the generated jump table will use a `jmp` with the `notrack` prefix, and IBT will be marked as `true` in the `.note.gnu.property` section of the compiled elf file, so that the `NO_TRACK_EN` of the `MSR` will be set to `true` in user mode when the kernel is loaded. So user mode can use `notrack` to bypass CET without caring about setting or not setting `NO_TRACK_EN`. --- fw/str.h | 2 +- fw/str_avx2.S | 6 +- tls/bignum.c | 270 +------------------------------------------- tls/bignum.h | 10 +- tls/bignum_x86-64.S | 110 ++++++++++++++++++ 5 files changed, 121 insertions(+), 277 deletions(-) diff --git a/fw/str.h b/fw/str.h index 2a53b8176..136860fd2 100644 --- a/fw/str.h +++ b/fw/str.h @@ -120,7 +120,7 @@ tfw_cstrtolower_wo_avx2(void *dest, const void *src, size_t len) d[i] = tolower(s[i]); } -#if 0 +#ifdef AVX2 /* * The functions expect non-ovelapping strings, so use restrict notation in * the declarations just as a specification. diff --git a/fw/str_avx2.S b/fw/str_avx2.S index 9a93d10d3..02e3e0f2a 100644 --- a/fw/str_avx2.S +++ b/fw/str_avx2.S @@ -311,7 +311,7 @@ SYM_FUNC_START(__tfw_strtolower_avx2) * a constant and there is no speculation required for the attack. */ ANNOTATE_RETPOLINE_SAFE - jmpq *%rax + notrack jmpq *%rax .section .rodata .align 8 .str2low_switch: @@ -481,7 +481,7 @@ SYM_FUNC_START(__tfw_stricmp_avx2) /* Process short strings below 8 bytes in length. */ movq .stricmp_switch(,%rdx,8), %rax ANNOTATE_RETPOLINE_SAFE /* constant bounds check */ - jmpq *%rax + notrack jmpq *%rax .section .rodata .align 8 .stricmp_switch: @@ -838,7 +838,7 @@ SYM_FUNC_START(__tfw_stricmp_avx2_2lc) movq .sic2lc_switch(,%rdx,8), %rax ANNOTATE_RETPOLINE_SAFE /* constant bounds check */ - jmpq *%rax + notrack jmpq *%rax .section .rodata .align 8 .sic2lc_switch: diff --git a/tls/bignum.c b/tls/bignum.c index 73fb9fe22..d9c25a3e2 100644 --- a/tls/bignum.c +++ b/tls/bignum.c @@ -276,46 +276,6 @@ ttls_mpi_size(const TlsMpi *X) * All the shifts for more than 64 bits are for integer number of limbs, * so the straightforward 2n algorithm is fine to make the each case simpler. */ -int -ttls_mpi_shift_ll(TlsMpi *X, size_t count) -{ - size_t v0, t1, old_used = X->used, i = ttls_mpi_bitlen(X); - unsigned long r0 = 0, r1, *p = MPI_P(X); - - if (unlikely(!i)) - return 0; - - v0 = count >> BSHIFT; - t1 = count & BMASK; - i += count; - - if (WARN_ON_ONCE((X->limbs << BSHIFT) < i)) - return -ENOSPC; - - X->used = BITS_TO_LIMBS(i); - if (old_used < X->used) - bzero_fast(p + old_used, (X->used - old_used) * CIL); - - /* Shift by count / limb_size. */ - if (v0 > 0) { - for (i = X->used; i > v0; i--) - p[i - 1] = p[i - v0 - 1]; - for ( ; i > 0; i--) - p[i - 1] = 0; - } - - /* shift by count % limb_size. 
*/ - if (t1 > 0) { - for (i = v0; i < X->used; i++) { - r1 = p[i] >> (BIL - t1); - p[i] <<= t1; - p[i] |= r0; - r0 = r1; - } - } - - return 0; -} void ttls_mpi_shift_l(TlsMpi *X, const TlsMpi *A, size_t count) { @@ -371,50 +331,6 @@ ttls_mpi_shift_l(TlsMpi *X, const TlsMpi *A, size_t count) * All the shifts for more than 64 bits are for integer number of limbs, * so the straightforward 2n algorithm is fine to make the each case simpler. */ -int -ttls_mpi_shift_r(TlsMpi *X, size_t count) -{ - size_t i, v0, v1; - unsigned long r0 = 0, r1; - - if (unlikely(!X->used || !MPI_P(X)[X->used - 1])) { - WARN_ON_ONCE(X->used > 1); - return 0; - } - - v0 = count >> BSHIFT; - v1 = count & BMASK; - - if (v0 > X->used || (v0 == X->used && v1 > 0)) { - ttls_mpi_lset(X, 0); - return 0; - } - - /* - * Shift by count / limb_size - remove least significant limbs. - * There could be garbage after last used limb, so be careful. - */ - if (v0 > 0) { - X->used -= v0; - for (i = 0; i < X->used; i++) - MPI_P(X)[i] = MPI_P(X)[i + v0]; - } - - /* Shift by count % limb_size. */ - if (v1 > 0) { - for (i = X->used; i > 0; i--) { - r1 = MPI_P(X)[i - 1] << (BIL - v1); - MPI_P(X)[i - 1] >>= v1; - MPI_P(X)[i - 1] |= r0; - r0 = r1; - } - if (!MPI_P(X)[X->used - 1]) - --X->used; - } - - return 0; -} -#if 0 void ttls_mpi_shift_r(TlsMpi *X, size_t count) { @@ -469,7 +385,6 @@ ttls_mpi_shift_r(TlsMpi *X, size_t count) X->s = 1; bzero_fast(x + X->used, (X->limbs - X->used) * CIL); } -#endif #if DBG_TLS /** @@ -673,58 +588,6 @@ ttls_mpi_cmp_int(const TlsMpi *X, long z) * * @A and @B must be different, but either of them can accept the result @X. */ -#if 1 -int -ttls_mpi_add_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) -{ - size_t i; - unsigned long *a, *b, *x, c = 0; - - BUG_ON(A == B); - if (X == B) { - const TlsMpi *T = A; - A = X; - B = T; - } - - /* X should always be positive as a result of unsigned additions. */ - X->s = 1; - - if (WARN_ON_ONCE(X->limbs < max_t(unsigned short, A->used, B->used))) - return -ENOSPC; - X->used = A->used; - - a = MPI_P(A); - b = MPI_P(B); - x = MPI_P(X); - /* TODO #1064 move out condition from under the loop. */ - for (i = 0; i < B->used; i++, a++, b++, x++) { - if (i == X->used) { - ++X->used; - *x = c; - } else { - *x = *a + c; - } - c = *x < c; - *x += *b; - c += *x < *b; - } - for ( ; c; i++, a++, x++) { - BUG_ON(i >= X->limbs); - if (i == X->used) { - ++X->used; - *x = c; - } else { - *x = *a + c; - } - c = *x < c; - } - if (X != A && X->used > i) - memcpy_fast(x, a, (X->used - i) * CIL); - - return 0; -} -#else void ttls_mpi_add_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) { @@ -748,55 +611,11 @@ ttls_mpi_add_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) /* X should always be positive as a result of unsigned additions. */ X->s = 1; } -#endif - -static void -__mpi_sub(unsigned long *a, size_t a_len, unsigned long *b, size_t b_len, - unsigned long *r) -{ - unsigned long c = 0, z, b_tmp, *b_end = b + b_len, *a_end = a + a_len; - - BUG_ON(a_len < b_len); - for ( ; b < b_end; a++, b++, r++) { - z = *a < c; - b_tmp = *b; - *r = *a - c; - c = (*r < b_tmp) + z; - *r -= b_tmp; - } - while (c) { - z = *a < c; - *r = *a - c; - c = z; - a++; - r++; - } - BUG_ON(a > a_end); - memcpy_fast(r, a, (a_end - a) * CIL); -} /** * Unsigned subtraction: X = |A| - |B|. * @X may reference either @A or @B. 
*/ -#if 1 -int -ttls_mpi_sub_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) -{ - if (ttls_mpi_cmp_abs(A, B) < 0) - return -EINVAL; - - ttls_mpi_alloc(X, A->used); - - __mpi_sub(MPI_P(A), A->used, MPI_P(B), B->used, MPI_P(X)); - - /* X should always be positive as a result of unsigned subtractions. */ - X->s = 1; - mpi_fixup_used(X, A->used); - - return 0; -} -#else void ttls_mpi_sub_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) { @@ -826,15 +645,13 @@ ttls_mpi_sub_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) *x = *a - *b; } else { - //mpi_sub_x86_64(x, b, a, b_sz, a_sz); - __mpi_sub(MPI_P(A), A->used, MPI_P(B), B->used, MPI_P(X)); + mpi_sub_x86_64(x, b, a, b_sz, a_sz); } /* X should always be positive as a result of unsigned subtractions. */ X->s = 1; mpi_fixup_used(X, a_sz); } -#endif /** * Signed addition: X = A + B @@ -868,31 +685,6 @@ ttls_mpi_add_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) /** * Signed subtraction: X = A - B */ -#if 1 -int -ttls_mpi_sub_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) -{ - int r, s = A->s; - - if (A->s * B->s > 0) { - if (ttls_mpi_cmp_abs(A, B) >= 0) { - if ((r = ttls_mpi_sub_abs(X, A, B))) - return r; - X->s = s; - } else { - if ((r = ttls_mpi_sub_abs(X, B, A))) - return r; - X->s = -s; - } - } else { - if ((r = ttls_mpi_add_abs(X, A, B))) - return r; - X->s = s; - } - - return 0; -} -#else void ttls_mpi_sub_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) { @@ -918,7 +710,6 @@ ttls_mpi_sub_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B) X->s = s; } } -#endif /** * Signed addition: X = A + b @@ -935,18 +726,6 @@ ttls_mpi_add_int(TlsMpi *X, const TlsMpi *A, long b) /** * Signed subtraction: X = A - b */ -#if 1 -int -ttls_mpi_sub_int(TlsMpi *X, const TlsMpi *A, long b) -{ - DECLARE_MPI_AUTO(_B, 1); - MPI_P(&_B)[0] = (b < 0) ? -b : b; - _B.s = (b < 0) ? -1 : 1; - _B.limbs = _B.used = 1; - - return ttls_mpi_sub_mpi(X, A, &_B); -} -#else void ttls_mpi_sub_int(TlsMpi *X, const TlsMpi *A, long b) { @@ -966,7 +745,6 @@ ttls_mpi_sub_int(TlsMpi *X, const TlsMpi *A, long b) X->s = -1; } } -#endif #define MULADDC_INIT \ asm( "xorq %%r8, %%r8 \n\t" @@ -1343,49 +1121,6 @@ __mpi_montg_init(unsigned long *mm, const TlsMpi *N) * TODO #1335: this is used for modular exponentiation only, so repalce it with * an adequate assembly implementation for the RSA handshakes. */ -#if 1 -static int -__mpi_montmul(TlsMpi *A, const TlsMpi *B, const TlsMpi *N, unsigned long mm, - TlsMpi *T) -{ - size_t i, n, m; - unsigned long u0, u1, *d; - - BUG_ON(T->limbs < N->used + 1); - bzero_fast(MPI_P(T), T->limbs * CIL); - - d = MPI_P(T); - n = N->used; - m = (B->used < n) ? B->used : n; - - for (i = 0; i < n; i++) { - /* T = (T + u0*B + u1*N) / 2^BIL */ - u0 = MPI_P(A)[i]; - u1 = (d[0] + u0 * MPI_P(B)[0]) * mm; - - __mpi_mul(m, MPI_P(B), d, u0); - __mpi_mul(n, MPI_P(N), d, u1); - - *d++ = u0; - d[n + 1] = 0; - } - mpi_fixup_used(T, T->limbs); - - memcpy_fast(MPI_P(A), d, (n + 1) * CIL); - mpi_fixup_used(A, n + 1); - - if (ttls_mpi_cmp_abs(A, N) >= 0) { - __mpi_sub(MPI_P(A), A->used, MPI_P(N), N->used, MPI_P(A)); - mpi_fixup_used(A, A->used); - } else { - /* Prevent timing attacks. 
*/ - __mpi_sub(MPI_P(T), T->used, MPI_P(A), A->used, MPI_P(T)); - mpi_fixup_used(T, T->used); - } - - return 0; -} -#else static int __mpi_montmul(TlsMpi *A, const TlsMpi *B, const TlsMpi *N, unsigned long mm, TlsMpi *T) @@ -1427,7 +1162,6 @@ __mpi_montmul(TlsMpi *A, const TlsMpi *B, const TlsMpi *N, unsigned long mm, return 0; } -#endif /** * Montgomery reduction: A = A * R^-1 mod N @@ -1640,7 +1374,7 @@ ttls_mpi_gcd(TlsMpi *G, const TlsMpi *A, const TlsMpi *B) } if (lz) - ttls_mpi_shift_ll(&TB, lz); + ttls_mpi_shift_l(G, &TB, lz); else ttls_mpi_copy(G, &TB); } diff --git a/tls/bignum.h b/tls/bignum.h index aa0c4e1a6..3cdb676a7 100644 --- a/tls/bignum.h +++ b/tls/bignum.h @@ -183,7 +183,7 @@ int ttls_mpi_safe_cond_swap(TlsMpi *X, TlsMpi *Y, unsigned char swap); void ttls_mpi_lset(TlsMpi *X, long z); void ttls_mpi_shift_l(TlsMpi *X, const TlsMpi *A, size_t count); -int ttls_mpi_shift_r(TlsMpi *X, size_t count); +void ttls_mpi_shift_r(TlsMpi *X, size_t count); int ttls_mpi_get_bit(const TlsMpi *X, size_t pos); void ttls_mpi_set_bit(TlsMpi *X, size_t pos, unsigned char val); size_t ttls_mpi_lsb(const TlsMpi *X); @@ -193,13 +193,13 @@ int ttls_mpi_cmp_abs(const TlsMpi *X, const TlsMpi *Y); int ttls_mpi_cmp_mpi(const TlsMpi *X, const TlsMpi *Y); int ttls_mpi_cmp_int(const TlsMpi *X, long z); -int ttls_mpi_add_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); +void ttls_mpi_add_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); void ttls_mpi_add_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); void ttls_mpi_add_int(TlsMpi *X, const TlsMpi *A, long b); -int ttls_mpi_sub_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); -int ttls_mpi_sub_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); -int ttls_mpi_sub_int(TlsMpi *X, const TlsMpi *A, long b); +void ttls_mpi_sub_abs(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); +void ttls_mpi_sub_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); +void ttls_mpi_sub_int(TlsMpi *X, const TlsMpi *A, long b); void ttls_mpi_mul_mpi(TlsMpi *X, const TlsMpi *A, const TlsMpi *B); void ttls_mpi_mul_uint(TlsMpi *X, const TlsMpi *A, unsigned long b); diff --git a/tls/bignum_x86-64.S b/tls/bignum_x86-64.S index 5588f91d7..29b470647 100644 --- a/tls/bignum_x86-64.S +++ b/tls/bignum_x86-64.S @@ -138,6 +138,116 @@ SYM_FUNC_START(mpi_add_mod_p256_x86_64) SYM_FUNC_END(mpi_add_mod_p256_x86_64) +/** + * Subtract X = A - B, where A->used >= B->used. + * + * %RDI - pointer to X; + * %RSI - pointer to B; + * %RDX - pointer to A; + * %RCX - B->used (used directly for looping); + * %R8 - A->used. + * + * TODO #1335 it seems we can throw out the generic-length functions. + */ +SYM_FUNC_START(mpi_sub_x86_64) + subq %rcx, %r8 + addq $1, %r8 + + /* Get code address by size of tail. 
*/ +.section .rodata +.align 8 +.sub_tail_jmp_tbl: + .quad .sub_tail0 + .quad .sub_tail1 + .quad .sub_tail2 + .quad .sub_tail3 +.text + pushq %rbx + movq %rcx, %rbx + andq $3, %rbx + movq .sub_tail_jmp_tbl(, %rbx, 8), %rbx + + xorq %rax, %rax + shrq $2, %rcx + jz .sub_small_b + pushq %r12 + clc +.sub_by_4: + movq (%rdx, %rax, 8), %r9 + movq 8(%rdx, %rax, 8), %r10 + movq 16(%rdx, %rax, 8), %r11 + movq 24(%rdx, %rax, 8), %r12 + sbbq (%rsi, %rax, 8), %r9 + sbbq 8(%rsi, %rax, 8), %r10 + sbbq 16(%rsi, %rax, 8), %r11 + sbbq 24(%rsi, %rax, 8), %r12 + movq %r9, (%rdi, %rax, 8) + incq %rax + movq %r10, (%rdi, %rax, 8) + incq %rax + movq %r11, (%rdi, %rax, 8) + incq %rax + movq %r12, (%rdi, %rax, 8) + incq %rax + loop .sub_by_4 + popq %r12 + ANNOTATE_RETPOLINE_SAFE + notrack jmpq *%rbx +.sub_small_b: + clc + ANNOTATE_RETPOLINE_SAFE + notrack jmpq *%rbx + +.sub_tail3: + movq (%rdx, %rax, 8), %r9 + sbbq (%rsi, %rax, 8), %r9 + movq %r9, (%rdi, %rax, 8) + incq %rax +.sub_tail2: + movq (%rdx, %rax, 8), %r10 + sbbq (%rsi, %rax, 8), %r10 + movq %r10, (%rdi, %rax, 8) + incq %rax +.sub_tail1: + movq (%rdx, %rax, 8), %r11 + sbbq (%rsi, %rax, 8), %r11 + movq %r11, (%rdi, %rax, 8) + incq %rax +.sub_tail0: + popq %rbx + + /* + * Borrow required digets from the more significant limbs in @A. + * There is either CF = 0 or we have more limbs in @A. + */ + movq %r8, %rcx + jnc .copy_msb + jmp .propagate_borrow +.propagate_borrow_loop: + movq (%rdx, %rax, 8), %r10 + sbbq $0, %r10 + movq %r10, (%rdi, %rax, 8) + incq %rax + jnc .need_copy +.propagate_borrow: + loop .propagate_borrow_loop + ud2 + + /* Copy the rest of A to X if no need to borrow. */ +.copy_msb_loop: + movq (%rdx, %rax, 8), %r10 + movq %r10, (%rdi, %rax, 8) + incq %rax +.copy_msb: + loop .copy_msb_loop + RET + +.need_copy: + cmpq %rdx, %rdi + jne .copy_msb + RET +SYM_FUNC_END(mpi_sub_x86_64) + /* * Operands size specialized implementations of the function above. * From 8afb1d6f54ac37ede6679feea891d3921152fa4d Mon Sep 17 00:00:00 2001 From: kingluo Date: Mon, 17 Jun 2024 19:31:02 +0800 Subject: [PATCH 14/25] clean up temporary changes --- fw/ss_skb.c | 4 ++-- fw/t/Makefile | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fw/ss_skb.c b/fw/ss_skb.c index 9e27f9282..2ef58d0fe 100644 --- a/fw/ss_skb.c +++ b/fw/ss_skb.c @@ -331,7 +331,7 @@ __split_linear_data(struct sk_buff *skb_head, struct sk_buff *skb, char *pspt, T_DBG3("[%d]: %s: skb [%p] pspt [%p] len [%d] tail_len [%d]\n", smp_processor_id(), __func__, skb, pspt, len, tail_len); - //BUG_ON(!skb->head_frag); + BUG_ON(!skb->head_frag); BUG_ON(tail_len <= 0); BUG_ON(!(alloc | tail_len)); BUG_ON(-len > tail_len); @@ -1374,7 +1374,7 @@ ss_skb_queue_coalesce_tail(struct sk_buff **skb_head, const struct sk_buff *skb) unsigned int headlen = skb_headlen(skb); if (headlen) { - //BUG_ON(!skb->head_frag); + BUG_ON(!skb->head_frag); head_frag.bv_len = headlen; head_frag.bv_page = virt_to_page(skb->head); head_frag.bv_offset = skb->data - diff --git a/fw/t/Makefile b/fw/t/Makefile index af59f23ff..d3c66254d 100644 --- a/fw/t/Makefile +++ b/fw/t/Makefile @@ -21,8 +21,8 @@ export TFW_CFLAGS EXTRA_CFLAGS += $(TFW_CFLAGS) -I$(src)/.. 
-I$(src)/../../ EXTRA_CFLAGS += $(TTLS_CFLAGS) -#obj-m += unit/ +obj-m += unit/ -#obj-m += tfw_fuzzer.o +obj-m += tfw_fuzzer.o tfw_fuzzer-objs = \ fuzzer.o From e95951a21ea4c445b37309f1f4d388428854fabb Mon Sep 17 00:00:00 2001 From: kingluo Date: Mon, 17 Jun 2024 19:33:27 +0800 Subject: [PATCH 15/25] add linux-6.8.9.patch --- linux-6.8.9.patch | 3175 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 3175 insertions(+) create mode 100644 linux-6.8.9.patch diff --git a/linux-6.8.9.patch b/linux-6.8.9.patch new file mode 100644 index 000000000..dcf8c282b --- /dev/null +++ b/linux-6.8.9.patch @@ -0,0 +1,3175 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 31fdaf4fe..998e98f0b 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -6479,6 +6479,12 @@ + + tdfx= [HW,DRM] + ++ tempesta_dbmem= [KNL] ++ Order of 2MB memory blocks reserved on each NUMA node ++ for Tempesta database. Huge pages are used if ++ possible. Minimum value to start Tempesta is 4 (32MB). ++ Default is 8, i.e. 512MB is reserved. ++ + test_suspend= [SUSPEND] + Format: { "mem" | "standby" | "freeze" }[,N] + Specify "mem" (for Suspend-to-RAM) or "standby" (for +diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h +index a2be3aeff..4ad2582ac 100644 +--- a/arch/x86/include/asm/fpu/api.h ++++ b/arch/x86/include/asm/fpu/api.h +@@ -26,6 +26,9 @@ + #define KFPU_387 _BITUL(0) /* 387 state will be initialized */ + #define KFPU_MXCSR _BITUL(1) /* MXCSR will be initialized */ + ++#ifdef CONFIG_SECURITY_TEMPESTA ++extern void __kernel_fpu_end_bh(void); ++#endif + extern void kernel_fpu_begin_mask(unsigned int kfpu_mask); + extern void kernel_fpu_end(void); + extern bool irq_fpu_usable(void); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 46603c6e4..c8a3ef380 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -585,7 +585,7 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c) + set_cpu_cap(c, X86_FEATURE_USER_SHSTK); + + if (kernel_ibt) +- wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN); ++ wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN | CET_NO_TRACK_EN); + else + wrmsrl(MSR_IA32_S_CET, 0); + +diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c +index 520deb411..cb5aecdda 100644 +--- a/arch/x86/kernel/fpu/core.c ++++ b/arch/x86/kernel/fpu/core.c +@@ -57,6 +57,10 @@ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + */ + bool irq_fpu_usable(void) + { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (likely(in_serving_softirq())) ++ return true; ++#endif + if (WARN_ON_ONCE(in_nmi())) + return false; + +@@ -420,7 +424,19 @@ EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); + + void kernel_fpu_begin_mask(unsigned int kfpu_mask) + { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ /* ++ * We don't know in which context the function is called, but we know ++ * preciseely that softirq uses FPU, so we have to disable softirq as ++ * well as task preemption. 
++ */ ++ if (!in_serving_softirq()) ++ local_bh_disable(); ++ else if (this_cpu_read(in_kernel_fpu)) ++ return; ++#else + preempt_disable(); ++#endif + + WARN_ON_FPU(!irq_fpu_usable()); + WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); +@@ -443,12 +459,28 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) + } + EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++void __kernel_fpu_end_bh(void) ++{ ++ this_cpu_write(in_kernel_fpu, false); ++} ++#endif ++ + void kernel_fpu_end(void) + { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (likely(in_serving_softirq())) ++ return; ++#endif + WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); + + this_cpu_write(in_kernel_fpu, false); ++ ++#ifdef CONFIG_SECURITY_TEMPESTA ++ local_bh_enable(); ++#else + preempt_enable(); ++#endif + } + EXPORT_SYMBOL_GPL(kernel_fpu_end); + +diff --git a/crypto/aead.c b/crypto/aead.c +index 549066335..c899216cc 100644 +--- a/crypto/aead.c ++++ b/crypto/aead.c +@@ -275,6 +275,24 @@ int crypto_has_aead(const char *alg_name, u32 type, u32 mask) + } + EXPORT_SYMBOL_GPL(crypto_has_aead); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++struct crypto_alg * ++crypto_find_aead(const char *alg_name, u32 type, u32 mask) ++{ ++ return crypto_find_alg(alg_name, &crypto_aead_type, type, mask); ++} ++EXPORT_SYMBOL_GPL(crypto_find_aead); ++ ++struct crypto_aead * ++crypto_alloc_aead_atomic(struct crypto_alg *alg) ++{ ++ alg = crypto_mod_get(alg); ++ BUG_ON(!alg); ++ return crypto_create_tfm(alg, &crypto_aead_type); ++} ++EXPORT_SYMBOL_GPL(crypto_alloc_aead_atomic); ++#endif ++ + static int aead_prepare_alg(struct aead_alg *alg) + { + struct crypto_istat_aead *istat = aead_get_stat(alg); +diff --git a/crypto/ahash.c b/crypto/ahash.c +index 80c3e5354..2ca0ed367 100644 +--- a/crypto/ahash.c ++++ b/crypto/ahash.c +@@ -612,6 +612,25 @@ struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type, + } + EXPORT_SYMBOL_GPL(crypto_alloc_ahash); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++/* Asynch hash is required by GHASH used in GCM. */ ++struct crypto_alg * ++crypto_find_ahash(const char *alg_name, u32 type, u32 mask) ++{ ++ return crypto_find_alg(alg_name, &crypto_ahash_type, type, mask); ++} ++EXPORT_SYMBOL_GPL(crypto_find_ahash); ++ ++struct crypto_ahash * ++crypto_alloc_ahash_atomic(struct crypto_alg *alg) ++{ ++ alg = crypto_mod_get(alg); ++ BUG_ON(!alg); ++ return crypto_create_tfm(alg, &crypto_ahash_type); ++} ++EXPORT_SYMBOL_GPL(crypto_alloc_ahash_atomic); ++#endif ++ + int crypto_has_ahash(const char *alg_name, u32 type, u32 mask) + { + return crypto_type_has_alg(alg_name, &crypto_ahash_type, type, mask); +diff --git a/crypto/api.c b/crypto/api.c +index 7f402107f..6a2f70072 100644 +--- a/crypto/api.c ++++ b/crypto/api.c +@@ -513,7 +513,11 @@ void *crypto_create_tfm_node(struct crypto_alg *alg, + char *mem; + int err; + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ mem = crypto_alloc_tfmmem(alg, frontend, node, GFP_ATOMIC); ++#else + mem = crypto_alloc_tfmmem(alg, frontend, node, GFP_KERNEL); ++#endif + if (IS_ERR(mem)) + goto out; + +@@ -570,6 +574,9 @@ struct crypto_alg *crypto_find_alg(const char *alg_name, + const struct crypto_type *frontend, + u32 type, u32 mask) + { ++ /* The function is slow and preemptable to be called in softirq. 
*/ ++ WARN_ON_ONCE(in_serving_softirq()); ++ + if (frontend) { + type &= frontend->maskclear; + mask &= frontend->maskclear; +diff --git a/crypto/cryptd.c b/crypto/cryptd.c +index 31d022d47..0cfe2f0cc 100644 +--- a/crypto/cryptd.c ++++ b/crypto/cryptd.c +@@ -27,6 +27,8 @@ + #include + #include + ++#include "internal.h" ++ + static unsigned int cryptd_max_cpu_qlen = 1000; + module_param(cryptd_max_cpu_qlen, uint, 0); + MODULE_PARM_DESC(cryptd_max_cpu_qlen, "Set cryptd Max queue depth"); +@@ -946,6 +948,75 @@ static struct crypto_template cryptd_tmpl = { + .module = THIS_MODULE, + }; + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ ++#define MAX_CACHED_ALG_COUNT 8 ++struct alg_cache { ++ int n; ++ spinlock_t lock; ++ struct { ++ u32 type; ++ u32 mask; ++ struct crypto_alg *alg; ++ char alg_name[CRYPTO_MAX_ALG_NAME]; ++ } a[MAX_CACHED_ALG_COUNT]; ++}; ++ ++static struct alg_cache skcipher_alg_cache; ++static struct alg_cache ahash_alg_cache; ++static struct alg_cache aead_alg_cache; ++ ++/* ++ * Finds a previously allocated algorithm or allocates a new one. In any case, ++ * returned alg holds at least one reference to its module. ++ */ ++static struct crypto_alg * ++cryptd_find_alg_cached(const char *cryptd_alg_name, u32 type, u32 mask, ++ struct crypto_alg *(*find_alg)(const char *, u32, u32), ++ struct alg_cache *__restrict ac) ++{ ++ struct crypto_alg *alg; ++ int k; ++ ++ spin_lock(&ac->lock); ++ for (k = 0; k < ac->n; k++) { ++ if (strcmp(ac->a[k].alg_name, cryptd_alg_name) == 0 ++ && ac->a[k].type == type && ac->a[k].mask == mask) ++ { ++ spin_unlock(&ac->lock); ++ return ac->a[k].alg; ++ } ++ } ++ spin_unlock(&ac->lock); ++ ++ /* Searching for the algorithm may sleep, so warn about it. */ ++ WARN_ON_ONCE(in_serving_softirq()); ++ ++ alg = find_alg(cryptd_alg_name, type, mask); ++ if (IS_ERR(alg)) ++ return alg; ++ ++ spin_lock(&ac->lock); ++ if (ac->n >= MAX_CACHED_ALG_COUNT) { ++ spin_unlock(&ac->lock); ++ BUG(); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ snprintf(ac->a[ac->n].alg_name, sizeof(ac->a[ac->n].alg_name), "%s", ++ cryptd_alg_name); ++ ++ ac->a[ac->n].type = type; ++ ac->a[ac->n].mask = mask; ++ ac->a[ac->n].alg = alg; ++ ++ ac->n += 1; ++ spin_unlock(&ac->lock); ++ ++ return alg; ++} ++#endif /* CONFIG_SECURITY_TEMPESTA */ ++ + struct cryptd_skcipher *cryptd_alloc_skcipher(const char *alg_name, + u32 type, u32 mask) + { +@@ -957,7 +1028,20 @@ struct cryptd_skcipher *cryptd_alloc_skcipher(const char *alg_name, + "cryptd(%s)", alg_name) >= CRYPTO_MAX_ALG_NAME) + return ERR_PTR(-EINVAL); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ { ++ struct crypto_alg *alg = ++ cryptd_find_alg_cached(cryptd_alg_name, type, mask, ++ crypto_find_skcipher, ++ &skcipher_alg_cache); ++ if (IS_ERR(alg)) ++ return (struct cryptd_skcipher *)alg; ++ ++ tfm = crypto_alloc_skcipher_atomic(alg); ++ } ++#else + tfm = crypto_alloc_skcipher(cryptd_alg_name, type, mask); ++#endif + if (IS_ERR(tfm)) + return ERR_CAST(tfm); + +@@ -1008,7 +1092,21 @@ struct cryptd_ahash *cryptd_alloc_ahash(const char *alg_name, + if (snprintf(cryptd_alg_name, CRYPTO_MAX_ALG_NAME, + "cryptd(%s)", alg_name) >= CRYPTO_MAX_ALG_NAME) + return ERR_PTR(-EINVAL); ++ ++#ifdef CONFIG_SECURITY_TEMPESTA ++ { ++ struct crypto_alg *alg = ++ cryptd_find_alg_cached(cryptd_alg_name, type, mask, ++ crypto_find_ahash, ++ &ahash_alg_cache); ++ if (IS_ERR(alg)) ++ return (struct cryptd_ahash *)alg; ++ ++ tfm = crypto_alloc_ahash_atomic(alg); ++ } ++#else + tfm = crypto_alloc_ahash(cryptd_alg_name, type, mask); ++#endif + if (IS_ERR(tfm)) + return ERR_CAST(tfm); + 
if (tfm->base.__crt_alg->cra_module != THIS_MODULE) { +@@ -1065,7 +1163,21 @@ struct cryptd_aead *cryptd_alloc_aead(const char *alg_name, + if (snprintf(cryptd_alg_name, CRYPTO_MAX_ALG_NAME, + "cryptd(%s)", alg_name) >= CRYPTO_MAX_ALG_NAME) + return ERR_PTR(-EINVAL); ++ ++#ifdef CONFIG_SECURITY_TEMPESTA ++ { ++ struct crypto_alg *alg = ++ cryptd_find_alg_cached(cryptd_alg_name, type, mask, ++ crypto_find_aead, ++ &aead_alg_cache); ++ if (IS_ERR(alg)) ++ return (struct cryptd_aead *)alg; ++ ++ tfm = crypto_alloc_aead_atomic(alg); ++ } ++#else + tfm = crypto_alloc_aead(cryptd_alg_name, type, mask); ++#endif + if (IS_ERR(tfm)) + return ERR_CAST(tfm); + if (tfm->base.__crt_alg->cra_module != THIS_MODULE) { +diff --git a/crypto/shash.c b/crypto/shash.c +index c3f7f6a25..d6b24abdb 100644 +--- a/crypto/shash.c ++++ b/crypto/shash.c +@@ -368,6 +368,24 @@ int hash_prepare_alg(struct hash_alg_common *alg) + return 0; + } + ++#ifdef CONFIG_SECURITY_TEMPESTA ++struct crypto_alg * ++crypto_find_shash(const char *alg_name, u32 type, u32 mask) ++{ ++ return crypto_find_alg(alg_name, &crypto_shash_type, type, mask); ++} ++EXPORT_SYMBOL_GPL(crypto_find_shash); ++ ++struct crypto_shash * ++crypto_alloc_shash_atomic(struct crypto_alg *alg) ++{ ++ alg = crypto_mod_get(alg); ++ BUG_ON(!alg); ++ return crypto_create_tfm(alg, &crypto_shash_type); ++} ++EXPORT_SYMBOL_GPL(crypto_alloc_shash_atomic); ++#endif ++ + static int shash_prepare_alg(struct shash_alg *alg) + { + struct crypto_alg *base = &alg->halg.base; +diff --git a/crypto/skcipher.c b/crypto/skcipher.c +index bc70e159d..c80a42103 100644 +--- a/crypto/skcipher.c ++++ b/crypto/skcipher.c +@@ -903,6 +903,24 @@ struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name, + } + EXPORT_SYMBOL_GPL(crypto_alloc_skcipher); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++struct crypto_alg * ++crypto_find_skcipher(const char *alg_name, u32 type, u32 mask) ++{ ++ return crypto_find_alg(alg_name, &crypto_skcipher_type, type, mask); ++} ++EXPORT_SYMBOL_GPL(crypto_find_skcipher); ++ ++struct crypto_skcipher * ++crypto_alloc_skcipher_atomic(struct crypto_alg *alg) ++{ ++ alg = crypto_mod_get(alg); ++ BUG_ON(!alg); ++ return crypto_create_tfm(alg, &crypto_skcipher_type); ++} ++EXPORT_SYMBOL_GPL(crypto_alloc_skcipher_atomic); ++#endif ++ + struct crypto_sync_skcipher *crypto_alloc_sync_skcipher( + const char *alg_name, u32 type, u32 mask) + { +diff --git a/include/crypto/aead.h b/include/crypto/aead.h +index 51382befb..c5e326650 100644 +--- a/include/crypto/aead.h ++++ b/include/crypto/aead.h +@@ -201,6 +201,11 @@ static inline struct crypto_aead *__crypto_aead_cast(struct crypto_tfm *tfm) + */ + struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++struct crypto_alg *crypto_find_aead(const char *alg_name, u32 type, u32 mask); ++struct crypto_aead *crypto_alloc_aead_atomic(struct crypto_alg *alg); ++#endif ++ + static inline struct crypto_tfm *crypto_aead_tfm(struct crypto_aead *tfm) + { + return &tfm->base; +diff --git a/include/crypto/hash.h b/include/crypto/hash.h +index 5d61f576c..e073d4396 100644 +--- a/include/crypto/hash.h ++++ b/include/crypto/hash.h +@@ -291,6 +291,11 @@ struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type, + + struct crypto_ahash *crypto_clone_ahash(struct crypto_ahash *tfm); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++struct crypto_alg *crypto_find_ahash(const char *alg_name, u32 type, u32 mask); ++struct crypto_ahash *crypto_alloc_ahash_atomic(struct crypto_alg *alg); 
++#endif ++ + static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm) + { + return &tfm->base; +@@ -704,6 +709,11 @@ struct crypto_shash *crypto_clone_shash(struct crypto_shash *tfm); + + int crypto_has_shash(const char *alg_name, u32 type, u32 mask); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++struct crypto_alg *crypto_find_shash(const char *alg_name, u32 type, u32 mask); ++struct crypto_shash *crypto_alloc_shash_atomic(struct crypto_alg *alg); ++#endif ++ + static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm) + { + return &tfm->base; +diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h +index c8857d7bd..c34c00a7c 100644 +--- a/include/crypto/skcipher.h ++++ b/include/crypto/skcipher.h +@@ -324,6 +324,12 @@ struct crypto_sync_skcipher *crypto_alloc_sync_skcipher(const char *alg_name, + struct crypto_lskcipher *crypto_alloc_lskcipher(const char *alg_name, + u32 type, u32 mask); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++struct crypto_alg *crypto_find_skcipher(const char *alg_name, u32 type, ++ u32 mask); ++struct crypto_skcipher *crypto_alloc_skcipher_atomic(struct crypto_alg *alg); ++#endif ++ + static inline struct crypto_tfm *crypto_skcipher_tfm( + struct crypto_skcipher *tfm) + { +diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h +index 89a6888f2..331a9fc0b 100644 +--- a/include/linux/fortify-string.h ++++ b/include/linux/fortify-string.h +@@ -14,7 +14,7 @@ void __read_overflow(void) __compiletime_error("detected read beyond size of obj + void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)"); + void __read_overflow2_field(size_t avail, size_t wanted) __compiletime_warning("detected read beyond size of field (2nd parameter); maybe use struct_group()?"); + void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)"); +-void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("detected write beyond size of field (1st parameter); maybe use struct_group()?"); ++void __write_overflow_field(size_t avail, size_t wanted); + + #define __compiletime_strlen(p) \ + ({ \ +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index 5c9bdd3ff..b0770d277 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -547,13 +547,13 @@ DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); + tasklets are more than enough. F.e. all serial device BHs et + al. should be converted to tasklets, not to softirqs. + */ +- ++/* Tempesta: process RX before TX to proxy traffic in one softirq shot. 
*/ + enum + { + HI_SOFTIRQ=0, + TIMER_SOFTIRQ, +- NET_TX_SOFTIRQ, + NET_RX_SOFTIRQ, ++ NET_TX_SOFTIRQ, + BLOCK_SOFTIRQ, + IRQ_POLL_SOFTIRQ, + TASKLET_SOFTIRQ, +@@ -610,7 +610,7 @@ extern void softirq_init(void); + extern void __raise_softirq_irqoff(unsigned int nr); + + extern void raise_softirq_irqoff(unsigned int nr); +-extern void raise_softirq(unsigned int nr); ++void raise_softirq(unsigned int nr); + + DECLARE_PER_CPU(struct task_struct *, ksoftirqd); + +diff --git a/include/linux/net.h b/include/linux/net.h +index c9b4a6379..fe0119212 100644 +--- a/include/linux/net.h ++++ b/include/linux/net.h +@@ -235,6 +235,8 @@ struct net_proto_family { + struct module *owner; + }; + ++extern const struct net_proto_family *get_proto_family(int family); ++ + struct iovec; + struct kvec; + +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index dba428b3a..20f90054f 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -167,11 +167,22 @@ static inline bool dev_xmit_complete(int rc) + # define LL_MAX_HEADER 32 + #endif + ++#ifdef CONFIG_SECURITY_TEMPESTA ++/* ++ * For Tempesta case the most traffic is TLS encrypted, so we need the extra ++ * room for TLS record header and explicit IV on skb allocation to avoid data ++ * movement on tcp_write_xmit(). Not all skbs have TLS headers - not a big deal ++ * to allocate 16 more bytes (5 - TLS header, 8 - IV, 3 - alignment). ++ */ ++#define TLS_MAX_HDR 16 ++#else ++#define TLS_MAX_HDR 0 ++#endif + #if !IS_ENABLED(CONFIG_NET_IPIP) && !IS_ENABLED(CONFIG_NET_IPGRE) && \ + !IS_ENABLED(CONFIG_IPV6_SIT) && !IS_ENABLED(CONFIG_IPV6_TUNNEL) +-#define MAX_HEADER LL_MAX_HEADER ++#define MAX_HEADER (LL_MAX_HEADER + TLS_MAX_HDR) + #else +-#define MAX_HEADER (LL_MAX_HEADER + 48) ++#define MAX_HEADER (LL_MAX_HEADER + 48 + TLS_MAX_HDR) + #endif + + /* +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index 5bafcfe18..b05a07d99 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -266,6 +266,12 @@ + SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X)) + #define SKB_MAX_HEAD(X) (SKB_MAX_ORDER((X), 0)) + #define SKB_MAX_ALLOC (SKB_MAX_ORDER(0, 2)) ++#ifdef CONFIG_SECURITY_TEMPESTA ++#define SKB_MAX_HEADER (PAGE_SIZE - MAX_TCP_HEADER \ ++ - SKB_DATA_ALIGN(sizeof(struct sk_buff)) \ ++ - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) \ ++ - SKB_DATA_ALIGN(1)) ++#endif + + /* return minimum truesize of one skb containing X bytes of data */ + #define SKB_TRUESIZE(X) ((X) + \ +@@ -861,6 +867,14 @@ struct sk_buff { + * UDP receive path is one user. + */ + unsigned long dev_scratch; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ struct { ++ __u8 present : 1; ++ __u8 tls_type : 7; ++ __u16 flags : 16; ++ unsigned int cb; ++ } tfw_cb; ++#endif + }; + }; + struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ +@@ -922,11 +936,17 @@ struct sk_buff { + fclone:2, + peeked:1, + head_frag:1, ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb_page:1, ++#endif + pfmemalloc:1, + pp_recycle:1; /* page_pool recycle indicator */ + #ifdef CONFIG_SKB_EXTENSIONS + __u8 active_extensions; + #endif ++#ifdef CONFIG_SECURITY_TEMPESTA ++ __u8 tail_lock:1; ++#endif + + /* Fields enclosed in headers group are copied + * using a single memcpy() in __copy_skb_header() +@@ -1096,6 +1116,98 @@ struct sk_buff { + #define SKB_ALLOC_RX 0x02 + #define SKB_ALLOC_NAPI 0x04 + ++#ifdef CONFIG_SECURITY_TEMPESTA ++long __get_skb_count(void); ++ ++enum { ++ /* This skb contains start of http2 frame. 
*/ ++ SS_F_HTTP2_FRAME_START = 0x01, ++ /* This skb contains new hpack dynamic table size. */ ++ SS_F_HTTT2_HPACK_TBL_SZ_ENCODED = 0x02, ++ /* This skb contains headers frame. */ ++ SS_F_HTTT2_FRAME_HEADERS = 0x04, ++ /* This skb contains data frame. */ ++ SS_F_HTTT2_FRAME_DATA = 0x08, ++ /* This skb was already prepared. */ ++ SS_F_HTTP2_FRAME_PREPARED = 0x10, ++ /* This skb acks new hpack dynamic tbl size. */ ++ SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING = 0x20, ++ /* ++ * These flags should be cleared when we copy flags ++ * from one skb to another one. ++ */ ++ TEMPESTA_SKB_FLAG_CLEAR_MASK = SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING | ++ SS_F_HTTT2_HPACK_TBL_SZ_ENCODED | ++ SS_F_HTTP2_FRAME_START, ++}; ++ ++static inline unsigned long ++skb_tfw_is_present(struct sk_buff *skb) ++{ ++ return skb->tfw_cb.present; ++} ++ ++static inline void ++skb_set_tfw_tls_type(struct sk_buff *skb, unsigned char tls_type) ++{ ++ BUG_ON(tls_type > 0x7F); ++ skb->tfw_cb.present = 1; ++ skb->tfw_cb.tls_type = tls_type; ++} ++ ++static inline unsigned char ++skb_tfw_tls_type(struct sk_buff *skb) ++{ ++ return skb->tfw_cb.present ? skb->tfw_cb.tls_type : 0; ++} ++ ++static inline void ++skb_set_tfw_flags(struct sk_buff *skb, unsigned short flags) ++{ ++ skb->tfw_cb.present = 1; ++ skb->tfw_cb.flags |= flags; ++} ++ ++static inline void ++skb_clear_tfw_flag(struct sk_buff *skb, unsigned short flag) ++{ ++ skb->tfw_cb.flags &= ~flag; ++} ++ ++static inline unsigned short ++skb_tfw_flags(struct sk_buff *skb) ++{ ++ return skb->tfw_cb.present ? skb->tfw_cb.flags : 0; ++} ++ ++static inline void ++skb_set_tfw_cb(struct sk_buff *skb, unsigned int cb) ++{ ++ skb->tfw_cb.present = 1; ++ skb->tfw_cb.cb = cb; ++} ++ ++static inline unsigned int ++skb_tfw_cb(struct sk_buff *skb) ++{ ++ return skb->tfw_cb.present ? skb->tfw_cb.cb : 0; ++} ++ ++static inline void ++skb_copy_tfw_cb(struct sk_buff *dst, struct sk_buff *src) ++{ ++ dst->dev = src->dev; ++} ++ ++static inline void ++skb_clear_tfw_cb(struct sk_buff *skb) ++{ ++ WARN_ON_ONCE(!skb->tfw_cb.present); ++ skb->dev = NULL; ++} ++ ++#endif ++ + /** + * skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves + * @skb: buffer +@@ -1267,6 +1379,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen); + bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, + bool *fragstolen, int *delta_truesize); + ++void *pg_skb_alloc(unsigned int size, gfp_t gfp_mask, int node); + struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, + int node); + struct sk_buff *__build_skb(void *data, unsigned int frag_size); +@@ -2402,7 +2515,11 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); + + static inline bool skb_is_nonlinear(const struct sk_buff *skb) + { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ return skb->tail_lock || skb->data_len; ++#else + return skb->data_len; ++#endif + } + + static inline unsigned int skb_headlen(const struct sk_buff *skb) +@@ -2714,6 +2831,20 @@ static inline unsigned int skb_headroom(const struct sk_buff *skb) + return skb->data - skb->head; + } + ++#ifdef CONFIG_SECURITY_TEMPESTA ++/** ++ * skb_tailroom_locked - bytes at buffer end ++ * @skb: buffer to check ++ * ++ * Return the number of bytes of free space at the tail of an sk_buff with ++ * respect to tail locking only. ++ */ ++static inline int skb_tailroom_locked(const struct sk_buff *skb) ++{ ++ return skb->tail_lock ? 
0 : skb->end - skb->tail; ++} ++#endif ++ + /** + * skb_tailroom - bytes at buffer end + * @skb: buffer to check +diff --git a/include/linux/tempesta.h b/include/linux/tempesta.h +new file mode 100644 +index 000000000..90eedcba5 +--- /dev/null ++++ b/include/linux/tempesta.h +@@ -0,0 +1,55 @@ ++/** ++ * Linux interface for Tempesta FW. ++ * ++ * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). ++ * Copyright (C) 2015-2023 Tempesta Technologies, Inc. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, ++ * or (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. ++ * See the GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along with ++ * this program; if not, write to the Free Software Foundation, Inc., 59 ++ * Temple Place - Suite 330, Boston, MA 02111-1307, USA. ++ */ ++#ifndef __TEMPESTA_H__ ++#define __TEMPESTA_H__ ++ ++#include ++ ++typedef void (*TempestaTxAction)(void); ++ ++typedef struct { ++ int (*sk_alloc)(struct sock *sk, struct sk_buff *skb); ++ void (*sk_free)(struct sock *sk); ++ int (*sock_tcp_rcv)(struct sock *sk, struct sk_buff *skb); ++} TempestaOps; ++ ++typedef struct { ++ unsigned long addr; ++ unsigned long pages; /* number of 4KB pages */ ++} TempestaMapping; ++ ++/* Security hooks. */ ++int tempesta_new_clntsk(struct sock *newsk, struct sk_buff *skb); ++void tempesta_close_clntsk(struct sock *sk); ++void tempesta_register_ops(TempestaOps *tops); ++void tempesta_unregister_ops(TempestaOps *tops); ++ ++/* Network hooks. */ ++void tempesta_set_tx_action(TempestaTxAction action); ++void tempesta_del_tx_action(void); ++ ++/* Memory management. 
*/ ++void tempesta_reserve_pages(void); ++void tempesta_reserve_vmpages(void); ++int tempesta_get_mapping(int node, TempestaMapping **tm); ++ ++#endif /* __TEMPESTA_H__ */ ++ +diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h +index d94c242eb..90b5f794c 100644 +--- a/include/net/inet_sock.h ++++ b/include/net/inet_sock.h +@@ -87,7 +87,12 @@ struct inet_request_sock { + ecn_ok : 1, + acked : 1, + no_srccheck: 1, ++#ifdef CONFIG_SECURITY_TEMPESTA ++ smc_ok : 1, ++ aborted : 1; ++#else + smc_ok : 1; ++#endif + u32 ir_mark; + union { + struct ip_options_rcu __rcu *ireq_opt; +diff --git a/include/net/sock.h b/include/net/sock.h +index 54a796761..8c679819d 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -513,6 +513,19 @@ struct sock { + void (*sk_state_change)(struct sock *sk); + void (*sk_data_ready)(struct sock *sk); + void (*sk_write_space)(struct sock *sk); ++#ifdef CONFIG_SECURITY_TEMPESTA ++ int (*sk_prepare_xmit)(struct sock *sk, ++ struct sk_buff *skb, ++ unsigned int mss_now, ++ unsigned int *limit, ++ unsigned int *skbs); ++ int (*sk_write_xmit)(struct sock *sk, ++ struct sk_buff *skb, ++ unsigned int mss_now, ++ unsigned int limit, ++ unsigned int skbs); ++ void (*sk_destroy_cb)(struct sock *sk); ++#endif + void (*sk_error_report)(struct sock *sk); + int (*sk_backlog_rcv)(struct sock *sk, + struct sk_buff *skb); +@@ -930,6 +943,9 @@ enum sock_flags { + SOCK_XDP, /* XDP is attached */ + SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ + SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ ++#ifdef CONFIG_SECURITY_TEMPESTA ++ SOCK_TEMPESTA, /* The socket is managed by Tempesta FW */ ++#endif + }; + + #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) +@@ -1174,6 +1190,16 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) + __rc; \ + }) + ++/** ++ * sk_stream_closing - Return 1 if we still have things to send in our buffers. 
++ * @sk: socket to verify ++ */ ++static inline int sk_stream_closing(struct sock *sk) ++{ ++ return (1 << sk->sk_state) & ++ (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); ++} ++ + int sk_stream_wait_connect(struct sock *sk, long *timeo_p); + int sk_stream_wait_memory(struct sock *sk, long *timeo_p); + void sk_stream_wait_close(struct sock *sk, long timeo_p); +@@ -2136,8 +2162,7 @@ static inline bool sk_rethink_txhash(struct sock *sk) + static inline struct dst_entry * + __sk_dst_get(const struct sock *sk) + { +- return rcu_dereference_check(sk->sk_dst_cache, +- lockdep_sock_is_held(sk)); ++ return rcu_dereference_raw(sk->sk_dst_cache); + } + + static inline struct dst_entry * +diff --git a/include/net/tcp.h b/include/net/tcp.h +index f6eba9652..4c9e00994 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -318,6 +318,7 @@ bool tcp_check_oom(struct sock *sk, int shift); + + + extern struct proto tcp_prot; ++extern struct proto tcpv6_prot; + + #define TCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.tcp_statistics, field) + #define __TCP_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.tcp_statistics, field) +@@ -615,6 +616,8 @@ enum tcp_queue { + TCP_FRAG_IN_WRITE_QUEUE, + TCP_FRAG_IN_RTX_QUEUE, + }; ++int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, ++ unsigned int mss_now, gfp_t gfp); + int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, u32 len, + unsigned int mss_now, gfp_t gfp); +@@ -684,6 +687,22 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) + /* tcp.c */ + void tcp_get_info(struct sock *, struct tcp_info *); + ++/* Routines required by Tempesta FW. */ ++void tcp_cleanup_rbuf(struct sock *sk, int copied); ++extern void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, ++ int size_goal); ++extern int tcp_send_mss(struct sock *sk, int *size_goal, int flags); ++extern void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb); ++extern void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags); ++extern void tcp_queue_skb(struct sock *sk, struct sk_buff *skb); ++extern void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now); ++extern void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, ++ int decr); ++extern void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2); ++extern void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2); ++extern int tcp_close_state(struct sock *sk); ++extern void skb_entail(struct sock *sk, struct sk_buff *skb); ++ + /* Read 'sendfile()'-style from a TCP socket */ + int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); +diff --git a/include/net/tls.h b/include/net/tls.h +index 33f657d3c..1b5286933 100644 +--- a/include/net/tls.h ++++ b/include/net/tls.h +@@ -67,6 +67,13 @@ struct tls_rec; + #define TLS_MAX_REC_SEQ_SIZE 8 + #define TLS_MAX_AAD_SIZE TLS_AAD_SPACE_SIZE + ++#ifdef CONFIG_SECURITY_TEMPESTA ++#define TLS_MAX_TAG_SZ 16 ++/* Maximum size for required skb overhead: header, IV, tag. */ ++#define TLS_MAX_OVERHEAD (TLS_HEADER_SIZE + TLS_AAD_SPACE_SIZE \ ++ + TLS_MAX_TAG_SZ) ++#endif ++ + /* For CCM mode, the full 16-bytes of IV is made of '4' fields of given sizes. 
+ * + * IV[16] = b0[1] || implicit nonce[4] || explicit nonce[8] || length[3] +diff --git a/include/uapi/linux/lsm.h b/include/uapi/linux/lsm.h +index f8aef9ade..077dbdefa 100644 +--- a/include/uapi/linux/lsm.h ++++ b/include/uapi/linux/lsm.h +@@ -62,6 +62,7 @@ struct lsm_ctx { + #define LSM_ID_LOCKDOWN 108 + #define LSM_ID_BPF 109 + #define LSM_ID_LANDLOCK 110 ++#define LSM_ID_TEMPESTA 111 + + /* + * LSM_ATTR_XXX definitions identify different LSM attributes +diff --git a/kernel/irq_work.c b/kernel/irq_work.c +index 2f4fb336d..a65f9370e 100644 +--- a/kernel/irq_work.c ++++ b/kernel/irq_work.c +@@ -180,6 +180,7 @@ out: + return true; + #endif /* CONFIG_SMP */ + } ++EXPORT_SYMBOL_GPL(irq_work_queue_on); + + bool irq_work_needs_cpu(void) + { +diff --git a/kernel/softirq.c b/kernel/softirq.c +index 210cf5f8d..334ad6ab4 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -29,6 +29,7 @@ + #include + + #include ++#include + + #define CREATE_TRACE_POINTS + #include +@@ -61,7 +62,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp + DEFINE_PER_CPU(struct task_struct *, ksoftirqd); + + const char * const softirq_to_name[NR_SOFTIRQS] = { +- "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL", ++ "HI", "TIMER", "NET_RX", "NET_TX", "BLOCK", "IRQ_POLL", + "TASKLET", "SCHED", "HRTIMER", "RCU" + }; + +@@ -577,6 +578,10 @@ restart: + wakeup_softirqd(); + } + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ __kernel_fpu_end_bh(); ++#endif ++ + account_softirq_exit(current); + lockdep_softirq_end(in_hardirq); + softirq_handle_end(); +@@ -687,6 +692,7 @@ void raise_softirq(unsigned int nr) + raise_softirq_irqoff(nr); + local_irq_restore(flags); + } ++EXPORT_SYMBOL(raise_softirq); + + void __raise_softirq_irqoff(unsigned int nr) + { +diff --git a/mm/Makefile b/mm/Makefile +index 4abb40b91..46fb0bd1b 100644 +--- a/mm/Makefile ++++ b/mm/Makefile +@@ -129,6 +129,7 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o + obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o + obj-$(CONFIG_PTDUMP_CORE) += ptdump.o + obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o ++obj-$(CONFIG_SECURITY_TEMPESTA) += tempesta_mm.o + obj-$(CONFIG_IO_MAPPING) += io-mapping.o + obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o + obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o +diff --git a/mm/mm_init.c b/mm/mm_init.c +index 2c19f5515..64a26935a 100644 +--- a/mm/mm_init.c ++++ b/mm/mm_init.c +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + #include "internal.h" + #include "slab.h" + #include "shuffle.h" +@@ -2772,6 +2773,15 @@ void __init mm_core_init(void) + stack_depot_early_init(); + mem_init(); + mem_init_print_info(); ++ ++#ifdef CONFIG_SECURITY_TEMPESTA ++ /* ++ * Tempesta: reserve pages just when zones are initialized ++ * to get continous address space of huge pages. ++ */ ++ tempesta_reserve_pages(); ++#endif ++ + kmem_cache_init(); + /* + * page_owner must be initialized after buddy is ready, and also after +@@ -2790,6 +2800,12 @@ void __init mm_core_init(void) + init_espfix_bsp(); + /* Should be run after espfix64 is set up. */ + pti_init(); ++ ++#ifdef CONFIG_SECURITY_TEMPESTA ++ /* Try vmalloc() if the previous one failed. */ ++ tempesta_reserve_vmpages(); ++#endif ++ + kmsan_init_runtime(); + mm_cache_init(); + } +diff --git a/mm/tempesta_mm.c b/mm/tempesta_mm.c +new file mode 100644 +index 000000000..7ee3ead54 +--- /dev/null ++++ b/mm/tempesta_mm.c +@@ -0,0 +1,274 @@ ++/** ++ * Tempesta Memory Reservation ++ * ++ * Copyright (C) 2015-2022 Tempesta Technologies, Inc. 
++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, ++ * or (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ * FOR A PARTICULAR PURPOSE. ++ * See the GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along with ++ * this program; if not, write to the Free Software Foundation, Inc., 59 ++ * Temple Place - Suite 330, Boston, MA 02111-1307, USA. ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++#include "internal.h" ++ ++#define MAX_PGORDER 16 /* 128GB per one table */ ++#define MIN_PGORDER 4 /* 32MB */ ++#define DEFAULT_PGORDER 8 /* 512MB */ ++/* Modern processors support up to 1.5TB of RAM, be ready for 2TB. */ ++#define GREEDY_ARNUM (1024 * 1024 + 1) ++#define PGNUM (1 << pgorder) ++#define PGNUM4K (PGNUM * (1 << HUGETLB_PAGE_ORDER)) ++ ++static int pgorder = DEFAULT_PGORDER; ++static gfp_t gfp_f = GFP_HIGHUSER | __GFP_COMP | __GFP_THISNODE | __GFP_ZERO ++ | __GFP_RETRY_MAYFAIL; ++static TempestaMapping map[MAX_NUMNODES]; ++/* ++ * Modern x86-64 has not more than 512GB RAM per physical node. ++ * This is very large amount of memory, but it will be freed when ++ * initialization phase ends. ++ */ ++static struct page *greedy[GREEDY_ARNUM] __initdata = { 0 }; ++ ++static int __init ++tempesta_setup_pages(char *str) ++{ ++ get_option(&str, &pgorder); ++ if (pgorder < MIN_PGORDER) { ++ pr_err("Tempesta: bad dbmem value %d, must be [%d:%d]\n", ++ pgorder, MIN_PGORDER, MAX_PGORDER); ++ pgorder = MIN_PGORDER; ++ } ++ if (pgorder > MAX_PGORDER) { ++ pr_err("Tempesta: bad dbmem value %d, must be [%d:%d]\n", ++ pgorder, MIN_PGORDER, MAX_PGORDER); ++ pgorder = MAX_PGORDER; ++ } ++ ++ return 1; ++} ++__setup("tempesta_dbmem=", tempesta_setup_pages); ++ ++/** ++ * The code is somewhat stollen from mm/hugetlb.c. ++ */ ++static struct page * ++tempesta_alloc_hpage(int nid) ++{ ++ struct page *p; ++ ++ p = alloc_pages_node(nid, gfp_f, HUGETLB_PAGE_ORDER); ++ if (!p) ++ return NULL; ++ ++ count_vm_event(HTLB_BUDDY_PGALLOC); ++ ++ __ClearPageReserved(p); ++ ++ return p; ++} ++ ++static void ++tempesta_free_hpage(struct page *p) ++{ ++ __free_pages(p, HUGETLB_PAGE_ORDER); ++} ++ ++/** ++ * Greedely alloc huge pages and try to find continous region organized ++ * by sorted set of allocated pages. When the region is found, all pages ++ * out of it are returned to system. ++ */ ++static struct page * ++tempesta_alloc_contmem(int nid) ++{ ++ long min = -1, start = -1, curr = 0, end = -1, max = -1; ++ struct page *p; ++ ++ while (1) { ++ p = tempesta_alloc_hpage(nid); ++ if (!p) ++ goto err; ++ curr = ((long)page_address(p) - PAGE_OFFSET) >> HPAGE_SHIFT; ++ /* ++ * The first kernel mapped page is always reserved. ++ * Keep untouched (zero) bounds for faster lookups. ++ */ ++ BUG_ON(curr < 1 || curr >= GREEDY_ARNUM); ++ greedy[curr] = p; ++ ++ /* First time initialization. */ ++ if (min < 0) { ++ min = start = end = max = curr; ++ } else { ++ /* Update bounds for faster pages return. */ ++ if (min > curr) ++ min = curr; ++ if (max < curr) ++ max = curr; ++ /* Update continous memory segment bounds. 
*/ ++ if (curr == end + 1) { ++ while (end <= max && greedy[end + 1]) ++ ++end; ++ } ++ else if (curr + 1 == start) { ++ while (start >= min && greedy[start - 1]) ++ --start; ++ } ++ else { ++ /* Try to find new continous segment. */ ++ long i, d_max = 0, good_start = start = min; ++ for (i = min; i <= max; ++i) { ++ if (greedy[i]) { ++ if (start == -1) ++ start = i; ++ end = i; ++ if (i - start + 1 == PGNUM) ++ break; ++ continue; ++ } ++ ++ if (start > 0 && end - start > d_max) { ++ good_start = start; ++ d_max = end - start; ++ } ++ start = -1; ++ } ++ if (end - start < d_max) { ++ start = good_start; ++ end = start + d_max; ++ } ++ } ++ } ++ ++ if (end - start + 1 == PGNUM) ++ break; /* continous space is built! */ ++ } ++ ++ /* Return unnecessary pages. */ ++ BUG_ON(min < 0 || start < 0 || end < 0 || max < 0); ++ for ( ; min < start; ++min) ++ if (greedy[min]) { ++ tempesta_free_hpage(greedy[min]); ++ greedy[min] = NULL; ++ } ++ for ( ; max > end; --max) ++ if (greedy[max]) { ++ tempesta_free_hpage(greedy[max]); ++ greedy[max] = NULL; ++ } ++ return greedy[start]; ++ ++err: ++ pr_err("Tempesta: cannot allocate %u continous huge pages at node" ++ " %d\n", PGNUM, nid); ++ for ( ; min >= 0 && min <= max; ++min) ++ if (greedy[min]) { ++ tempesta_free_hpage(greedy[min]); ++ greedy[min] = NULL; ++ } ++ return NULL; ++} ++ ++/** ++ * Allocate continous virtual space of huge pages for Tempesta. ++ * We do not use giantic 1GB pages since not all modern x86-64 CPUs ++ * allows them in virtualized mode. ++ * ++ * TODO try firstly to allocate giantic pages, next huge pages and finally ++ * fallback to common 4KB pages allocation if previous tries failed. ++ */ ++void __init ++tempesta_reserve_pages(void) ++{ ++ int nid; ++ struct page *p; ++ ++ for_each_online_node(nid) { ++ p = tempesta_alloc_contmem(nid); ++ if (!p) ++ goto err; ++ ++ map[nid].addr = (unsigned long)page_address(p); ++ map[nid].pages = PGNUM4K; ++ ++ pr_info("Tempesta: allocated huge pages space %pK %luMB at node" ++ " %d\n", page_address(p), ++ PGNUM4K * PAGE_SIZE / (1024 * 1024), nid); ++ } ++ ++ return; ++err: ++ for_each_online_node(nid) { ++ struct page *pend; ++ if (!map[nid].addr) ++ continue; ++ for (p = virt_to_page(map[nid].addr), pend = p + PGNUM4K; ++ p < pend; p += 1 << HUGETLB_PAGE_ORDER) ++ tempesta_free_hpage(p); ++ } ++ memset(map, 0, sizeof(map)); ++} ++ ++/** ++ * Allocates necessary space if tempesta_reserve_pages() failed. 
++ */ ++void __init ++tempesta_reserve_vmpages(void) ++{ ++ int nid, maps = 0; ++ size_t vmsize = PGNUM * (1 << HPAGE_SHIFT); ++ ++ for_each_online_node(nid) ++ maps += !!map[nid].addr; ++ ++ BUG_ON(maps && maps < nr_online_nodes); ++ if (maps == nr_online_nodes) ++ return; ++ ++ for_each_online_node(nid) { ++ pr_warn("Tempesta: allocate %u vmalloc pages at node %d\n", ++ PGNUM4K, nid); ++ ++ map[nid].addr = (unsigned long)vzalloc_node(vmsize, nid); ++ if (!map[nid].addr) ++ goto err; ++ map[nid].pages = PGNUM4K; ++ } ++ ++ return; ++err: ++ pr_err("Tempesta: cannot vmalloc area of %lu bytes at node %d\n", ++ vmsize, nid); ++ for_each_online_node(nid) ++ if (map[nid].addr) ++ vfree((void *)map[nid].addr); ++ memset(map, 0, sizeof(map)); ++} ++ ++int ++tempesta_get_mapping(int nid, TempestaMapping **tm) ++{ ++ if (unlikely(!map[nid].addr)) ++ return -ENOMEM; ++ ++ *tm = &map[nid]; ++ ++ return 0; ++} ++EXPORT_SYMBOL(tempesta_get_mapping); ++ +diff --git a/net/core/dev.c b/net/core/dev.c +index c365aa06f..efe607e21 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -4439,7 +4439,9 @@ int weight_p __read_mostly = 64; /* old backlog weight */ + int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ + int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ + int dev_rx_weight __read_mostly = 64; ++EXPORT_SYMBOL(dev_rx_weight); + int dev_tx_weight __read_mostly = 64; ++EXPORT_SYMBOL(dev_tx_weight); + + /* Called with irq disabled */ + static inline void ____napi_schedule(struct softnet_data *sd, +@@ -5128,6 +5130,28 @@ int netif_rx(struct sk_buff *skb) + } + EXPORT_SYMBOL(netif_rx); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++#include ++ ++static TempestaTxAction __rcu tempesta_tx_action = NULL; ++ ++void ++tempesta_set_tx_action(TempestaTxAction action) ++{ ++ rcu_assign_pointer(tempesta_tx_action, action); ++} ++EXPORT_SYMBOL(tempesta_set_tx_action); ++ ++void ++tempesta_del_tx_action(void) ++{ ++ rcu_assign_pointer(tempesta_tx_action, NULL); ++ synchronize_rcu(); ++} ++EXPORT_SYMBOL(tempesta_del_tx_action); ++#endif ++ ++ + static __latent_entropy void net_tx_action(struct softirq_action *h) + { + struct softnet_data *sd = this_cpu_ptr(&softnet_data); +@@ -5160,6 +5184,20 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) + } + } + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ { ++ TempestaTxAction action; ++ ++ rcu_read_lock(); ++ ++ action = rcu_dereference(tempesta_tx_action); ++ if (likely(action)) ++ action(); ++ ++ rcu_read_unlock(); ++ } ++#endif ++ + if (sd->output_queue) { + struct Qdisc *head; + +diff --git a/net/core/request_sock.c b/net/core/request_sock.c +index 63de5c635..76e222130 100644 +--- a/net/core/request_sock.c ++++ b/net/core/request_sock.c +@@ -127,3 +127,4 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, + out: + spin_unlock_bh(&fastopenq->lock); + } ++EXPORT_SYMBOL(reqsk_fastopen_remove); +diff --git a/net/core/skbuff.c b/net/core/skbuff.c +index 71dee435d..c6b29cfc3 100644 +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -89,7 +89,9 @@ + #include "sock_destructor.h" + + struct kmem_cache *skbuff_cache __ro_after_init; ++#ifndef CONFIG_SECURITY_TEMPESTA + static struct kmem_cache *skbuff_fclone_cache __ro_after_init; ++#endif + #ifdef CONFIG_SKB_EXTENSIONS + static struct kmem_cache *skbuff_ext_cache __ro_after_init; + #endif +@@ -540,6 +542,7 @@ struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) + } + EXPORT_SYMBOL(napi_build_skb); + ++#ifndef CONFIG_SECURITY_TEMPESTA + /* + 
* kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells + * the caller if emergency pfmemalloc reserves are being used. If it is and +@@ -595,6 +598,219 @@ out: + + return obj; + } ++#endif ++ ++#ifdef CONFIG_SECURITY_TEMPESTA ++static void kmalloc_reserve_size(unsigned int *size, gfp_t flags, int node, ++ bool *pfmemalloc) ++{ ++ bool ret_pfmemalloc = false; ++ size_t obj_size; ++ ++ obj_size = SKB_HEAD_ALIGN(*size); ++ if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && ++ !(flags & KMALLOC_NOT_NORMAL_BITS)) { ++ *size = SKB_SMALL_HEAD_CACHE_SIZE; ++ if (!gfp_pfmemalloc_allowed(flags)) ++ goto out; ++ /* Try again but now we are using pfmemalloc reserves */ ++ ret_pfmemalloc = true; ++ goto out; ++ } ++ ++ obj_size = kmalloc_size_roundup(obj_size); ++ /* The following cast might truncate high-order bits of obj_size, this ++ * is harmless because kmalloc(obj_size >= 2^32) will fail anyway. ++ */ ++ *size = (unsigned int)obj_size; ++ ++ /* ++ * Try a regular allocation, when that fails and we're not entitled ++ * to the reserves, fail. ++ */ ++ if (!gfp_pfmemalloc_allowed(flags)) ++ goto out; ++ ++ /* Try again but now we are using pfmemalloc reserves */ ++ ret_pfmemalloc = true; ++ ++out: ++ if (pfmemalloc) ++ *pfmemalloc = ret_pfmemalloc; ++} ++#endif ++ ++/* ++ * Chunks of size 128B, 256B, 512B, 1KB and 2KB. ++ * Typical sk_buff requires ~272B or ~552B (for fclone), ++ * skb_shared_info is ~320B. ++ */ ++#define PG_LISTS_N 5 ++#define PG_CHUNK_BITS (PAGE_SHIFT - 5) ++#define PG_CHUNK_SZ (1 << PG_CHUNK_BITS) ++#define PG_CHUNK_MASK (~(PG_CHUNK_SZ - 1)) ++#define PG_ALLOC_SZ(s) (((s) + (PG_CHUNK_SZ - 1)) & PG_CHUNK_MASK) ++#define PG_CHUNK_NUM(s) (PG_ALLOC_SZ(s) >> PG_CHUNK_BITS) ++#define PG_POOL_HLIM_BASE 256 ++ ++/** ++ * @lh - list head of chunk pool; ++ * @count - current number of chunks in @lh; ++ * @h_limit - hard limit for size of @lh; ++ * @max - current maximum allowed size of the list, can be 0. ++ */ ++typedef struct { ++ struct list_head lh; ++ unsigned int count; ++ unsigned int h_limit; ++ unsigned int max; ++} TfwSkbMemPool; ++ ++static DEFINE_PER_CPU(TfwSkbMemPool [PG_LISTS_N], pg_mpool); ++ ++static bool ++__pg_pool_grow(TfwSkbMemPool *pool) ++{ ++ if (!pool->count) { ++ /* Too few chunks were provisioned. */ ++ unsigned int n = max(pool->max, 1U) << 1; /* start from 2 */ ++ pool->max = (n > pool->h_limit) ? pool->h_limit : n; ++ return false; ++ } ++ if (pool->max < pool->h_limit) ++ ++pool->max; ++ return true; ++} ++ ++static bool ++__pg_pool_shrink(TfwSkbMemPool *pool) ++{ ++ if (unlikely(pool->count >= pool->max)) { ++ /* Producers are much faster consumers right now. */ ++ pool->max >>= 1; ++ while (pool->count > pool->max) { ++ struct list_head *pc = pool->lh.next; ++ list_del(pc); ++ put_page(virt_to_page(pc)); ++ --pool->count; ++ } ++ return false; ++ } ++ /* ++ * Producers and consumers look balanced. ++ * Slowly reduce provisioning. ++ */ ++ if (pool->max) ++ --pool->max; ++ return true; ++} ++ ++void * ++pg_skb_alloc(unsigned int size, gfp_t gfp_mask, int node) ++{ ++ /* ++ * Don't disable softirq if hardirqs are already disabled to avoid ++ * warning in __local_bh_enable_ip(). Disable user space process ++ * preemption as well as preemption by softirq (see SOFTIRQ_LOCK_OFFSET ++ * usage in spin locks for the same motivation). 
++ */ ++ bool dolock = !(in_irq() || irqs_disabled()); ++#define PREEMPT_CTX_DISABLE() \ ++do { \ ++ if (dolock) \ ++ local_bh_disable(); \ ++ preempt_disable(); \ ++} while (0) ++ ++#define PREEMPT_CTX_ENABLE() \ ++do { \ ++ preempt_enable(); \ ++ if (dolock) \ ++ local_bh_enable(); \ ++} while (0) ++ ++ char *ptr; ++ struct page *pg; ++ TfwSkbMemPool *pools; ++ unsigned int c, cn, o, l, po; ++ ++ cn = PG_CHUNK_NUM(size); ++ po = get_order(PG_ALLOC_SZ(size)); ++ ++ PREEMPT_CTX_DISABLE(); ++ ++ pools = this_cpu_ptr(pg_mpool); ++ ++ o = (cn == 1) ? 0 ++ : (cn == 2) ? 1 ++ : (cn <= 4) ? 2 ++ : (cn <= 8) ? 3 ++ : (cn <= 16) ? 4 : PG_LISTS_N; ++ ++ for (; o < PG_LISTS_N; ++o) ++ { ++ struct list_head *pc; ++ if (!__pg_pool_grow(&pools[o])) ++ continue; ++ ++ pc = pools[o].lh.next; ++ list_del(pc); ++ --pools[o].count; ++ ptr = (char *)pc; ++ pg = virt_to_page(ptr); ++ goto assign_tail_chunks; ++ } ++ ++ PREEMPT_CTX_ENABLE(); ++ ++ /* ++ * Add compound page metadata, if page order is > 0. ++ * Don't use __GFP_NOMEMALLOC to allow caller access to reserved pools if ++ * it requested so. ++ */ ++ gfp_mask |= __GFP_NOWARN | __GFP_NORETRY | (po ? __GFP_COMP : 0); ++ pg = alloc_pages_node(node, gfp_mask, po); ++ if (!pg) ++ return NULL; ++ ptr = (char *)page_address(pg); ++ /* ++ * Don't try to split compound page. Also don't try to reuse pages ++ * from reserved memory areas to put and free them quicker. ++ * ++ * TODO compound pages can be split as __alloc_page_frag() does it ++ * using fragment size in page reference counter. Large messages ++ * (e.g. large HTML pages returned by a backend server) go this way ++ * and allocate compound pages. ++ */ ++ if (po || page_is_pfmemalloc(pg)) ++ return ptr; ++ o = PAGE_SHIFT - PG_CHUNK_BITS; ++ ++ PREEMPT_CTX_DISABLE(); ++ ++ pools = this_cpu_ptr(pg_mpool); ++ ++assign_tail_chunks: ++ /* Split and store small tail chunks. */ ++ for (c = cn, cn = 1 << o, l = PG_LISTS_N - 1; c < cn; c += (1 << l)) { ++ struct list_head *chunk; ++ while (c + (1 << l) > cn) ++ --l; ++ chunk = (struct list_head *)(ptr + PG_CHUNK_SZ * c); ++ if (__pg_pool_shrink(&pools[l])) { ++ get_page(pg); ++ list_add(chunk, &pools[l].lh); ++ ++pools[l].count; ++ } ++ } ++ ++ PREEMPT_CTX_ENABLE(); ++ ++ return ptr; ++#undef PREEMPT_CTX_DISABLE ++#undef PREEMPT_CTX_ENABLE ++} ++EXPORT_SYMBOL(pg_skb_alloc); + + /* Allocate a new skbuff. We do this ourselves so we can fill in a few + * 'private' fields and also do memory statistics to find all the +@@ -622,27 +838,47 @@ out: + struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, + int flags, int node) + { ++#ifndef CONFIG_SECURITY_TEMPESTA + struct kmem_cache *cache; ++#endif + struct sk_buff *skb; + bool pfmemalloc; + u8 *data; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ size_t skb_sz = (flags & SKB_ALLOC_FCLONE) ++ ? SKB_DATA_ALIGN(sizeof(struct sk_buff_fclones)) ++ : SKB_DATA_ALIGN(sizeof(struct sk_buff)); ++ struct page *pg; ++#endif + ++#ifndef CONFIG_SECURITY_TEMPESTA + cache = (flags & SKB_ALLOC_FCLONE) + ? 
skbuff_fclone_cache : skbuff_cache; ++#endif + + if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) + gfp_mask |= __GFP_MEMALLOC; + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ kmalloc_reserve_size(&size, gfp_mask, node, &pfmemalloc); ++ if (!(skb = pg_skb_alloc(skb_sz + size, gfp_mask, node))) ++ return NULL; ++ data = (u8 *)skb + skb_sz; ++ pg = virt_to_head_page(data); ++ get_page(pg); ++#else + /* Get the HEAD */ + if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && + likely(node == NUMA_NO_NODE || node == numa_mem_id())) + skb = napi_skb_cache_get(); + else + skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); ++#endif + if (unlikely(!skb)) + return NULL; + prefetchw(skb); + ++#ifndef CONFIG_SECURITY_TEMPESTA + /* We do our best to align skb_shared_info on a separate cache + * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives + * aligned memory blocks, unless SLUB/SLAB debug is enabled. +@@ -651,6 +887,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, + data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc); + if (unlikely(!data)) + goto nodata; ++#endif + /* kmalloc_size_roundup() might give us more room than requested. + * Put skb_shared_info exactly at the end of allocated zone, + * to allow max possible filling before reallocation. +@@ -665,6 +902,10 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, + memset(skb, 0, offsetof(struct sk_buff, tail)); + __build_skb_around(skb, data, size); + skb->pfmemalloc = pfmemalloc; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb->head_frag = 1; ++ skb->skb_page = 1; ++#endif + + if (flags & SKB_ALLOC_FCLONE) { + struct sk_buff_fclones *fclones; +@@ -673,12 +914,19 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, + + skb->fclone = SKB_FCLONE_ORIG; + refcount_set(&fclones->fclone_ref, 1); ++ ++#ifdef CONFIG_SECURITY_TEMPESTA ++ fclones->skb2.skb_page = 1; ++ fclones->skb2.head_frag = 1; ++#endif + } + + return skb; + ++#ifndef CONFIG_SECURITY_TEMPESTA + nodata: + kmem_cache_free(cache, skb); ++#endif + return NULL; + } + EXPORT_SYMBOL(__alloc_skb); +@@ -1048,7 +1296,12 @@ static void kfree_skbmem(struct sk_buff *skb) + + switch (skb->fclone) { + case SKB_FCLONE_UNAVAILABLE: +- kmem_cache_free(skbuff_cache, skb); ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (skb->skb_page) ++ put_page(virt_to_page(skb)); ++ else ++#endif ++ kmem_cache_free(skbuff_cache, skb); + return; + + case SKB_FCLONE_ORIG: +@@ -1069,7 +1322,12 @@ static void kfree_skbmem(struct sk_buff *skb) + if (!refcount_dec_and_test(&fclones->fclone_ref)) + return; + fastpath: ++#ifdef CONFIG_SECURITY_TEMPESTA ++ BUG_ON(!skb->skb_page); ++ put_page(virt_to_page(skb)); ++#else + kmem_cache_free(skbuff_fclone_cache, fclones); ++#endif + } + + void skb_release_head_state(struct sk_buff *skb) +@@ -1156,6 +1414,13 @@ static void kfree_skb_add_bulk(struct sk_buff *skb, + struct skb_free_array *sa, + enum skb_drop_reason reason) + { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (likely(skb->skb_page)) { ++ __kfree_skb(skb); ++ return; ++ } ++#endif ++ + /* if SKB is a clone, don't handle this case */ + if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { + __kfree_skb(skb); +@@ -1345,6 +1610,17 @@ static void napi_skb_cache_put(struct sk_buff *skb) + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + u32 i; + ++ /* ++ * Tempesta uses its own fast page allocator for socket buffers, ++ * so no need to use napi_alloc_cache for paged skbs. 
++ */ ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (skb->skb_page) { ++ put_page(virt_to_page(skb)); ++ return; ++ } ++#endif ++ + if (!kasan_mempool_poison_object(skb)) + return; + +@@ -1376,7 +1652,12 @@ void napi_skb_free_stolen_head(struct sk_buff *skb) + skb_orphan(skb); + skb->slow_gro = 0; + } +- napi_skb_cache_put(skb); ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (skb->skb_page) ++ put_page(virt_to_page(skb)); ++ else ++#endif ++ napi_skb_cache_put(skb); + } + + void napi_consume_skb(struct sk_buff *skb, int budget) +@@ -1470,6 +1751,9 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) + n->sk = NULL; + __copy_skb_header(n, skb); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ C(tail_lock); ++#endif + C(len); + C(data_len); + C(mac_len); +@@ -1946,6 +2230,10 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) + refcount_read(&fclones->fclone_ref) == 1) { + n = &fclones->skb2; + refcount_set(&fclones->fclone_ref, 2); ++#ifdef CONFIG_SECURITY_TEMPESTA ++ BUG_ON(!skb->skb_page); ++ BUG_ON(!n->skb_page); ++#endif + n->fclone = SKB_FCLONE_CLONE; + } else { + if (skb_pfmemalloc(skb)) +@@ -1956,6 +2244,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) + return NULL; + + n->fclone = SKB_FCLONE_UNAVAILABLE; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ n->skb_page = 0; ++#endif + } + + return __skb_clone(n, skb); +@@ -2133,10 +2424,18 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ kmalloc_reserve_size(&size, gfp_mask, NUMA_NO_NODE, NULL); ++ data = pg_skb_alloc(size, gfp_mask, NUMA_NO_NODE); ++ if (!data) ++ goto nodata; ++ size = SKB_WITH_OVERHEAD(PG_ALLOC_SZ(size)); ++#else + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + goto nodata; + size = SKB_WITH_OVERHEAD(size); ++#endif + + /* Copy only real data... and, alas, header. This should be + * optimized for the cases when header is void. +@@ -2170,7 +2469,12 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, + off = (data + nhead) - skb->head; + + skb->head = data; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb->head_frag = 1; ++ skb->tail_lock = 0; ++#else + skb->head_frag = 0; ++#endif + skb->data += off; + + skb_set_end_offset(skb, size); +@@ -2196,7 +2500,11 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, + return 0; + + nofrags: ++#ifdef CONFIG_SECURITY_TEMPESTA ++ put_page(virt_to_page(data)); ++#else + skb_kfree_head(data, size); ++#endif + nodata: + return -ENOMEM; + } +@@ -2404,7 +2712,11 @@ int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) + return 0; + } + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ ntail = skb->data_len + pad - skb_tailroom_locked(skb); ++#else + ntail = skb->data_len + pad - (skb->end - skb->tail); ++#endif + if (likely(skb_cloned(skb) || ntail > 0)) { + err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); + if (unlikely(err)) +@@ -2687,7 +2999,13 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) + * plus 128 bytes for future expansions. If we have enough + * room at tail, reallocate without expansion only if skb is cloned. + */ +- int i, k, eat = (skb->tail + delta) - skb->end; ++ int i, k, eat; ++ ++#ifdef CONFIG_SECURITY_TEMPESTA ++ eat = delta - skb_tailroom_locked(skb); ++#else ++ eat = (skb->tail + delta) - skb->end; ++#endif + + if (eat > 0 || skb_cloned(skb)) { + if (pskb_expand_head(skb, 0, eat > 0 ? 
eat + 128 : 0, +@@ -4889,6 +5207,25 @@ static void skb_extensions_init(void) {} + + void __init skb_init(void) + { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ int cpu, l; ++ for_each_possible_cpu(cpu) ++ for (l = 0; l < PG_LISTS_N; ++l) { ++ TfwSkbMemPool *pool = per_cpu_ptr(&pg_mpool[l], cpu); ++ INIT_LIST_HEAD(&pool->lh); ++ /* ++ * Large chunks are also can be used to get smaller ++ * chunks, so we cache them more aggressively. ++ */ ++ pool->h_limit = PG_POOL_HLIM_BASE << l; ++ } ++#else ++ skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", ++ sizeof(struct sk_buff_fclones), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ NULL); ++#endif + skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", + sizeof(struct sk_buff), + 0, +@@ -4897,11 +5234,6 @@ void __init skb_init(void) + offsetof(struct sk_buff, cb), + sizeof_field(struct sk_buff, cb), + NULL); +- skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", +- sizeof(struct sk_buff_fclones), +- 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, +- NULL); + /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes. + * struct skb_shared_info is located at the end of skb->head, + * and should not be copied to/from user. +@@ -5779,7 +6111,15 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) + { + if (head_stolen) { + skb_release_head_state(skb); ++#ifdef CONFIG_SECURITY_TEMPESTA ++ /* ++ * fclones are possible here with Tempesta due to using ++ * pskb_copy_for_clone() in ss_send(). ++ */ ++ kfree_skbmem(skb); ++#else + kmem_cache_free(skbuff_cache, skb); ++#endif + } else { + __kfree_skb(skb); + } +@@ -6442,10 +6782,18 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ kmalloc_reserve_size(&size, gfp_mask, NUMA_NO_NODE, NULL); ++ data = pg_skb_alloc(size, gfp_mask, NUMA_NO_NODE); ++ if (!data) ++ return -ENOMEM; ++ size = SKB_WITH_OVERHEAD(PG_ALLOC_SZ(size)); ++#else + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + return -ENOMEM; + size = SKB_WITH_OVERHEAD(size); ++#endif + + /* Copy real data, and all frags */ + skb_copy_from_linear_data_offset(skb, off, data, new_hlen); +@@ -6458,7 +6806,11 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, + if (skb_cloned(skb)) { + /* drop the old head gracefully */ + if (skb_orphan_frags(skb, gfp_mask)) { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb_free_frag(data); ++#else + skb_kfree_head(data, size); ++#endif + return -ENOMEM; + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +@@ -6475,7 +6827,11 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, + + skb->head = data; + skb->data = data; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb->head_frag = 1; ++#else + skb->head_frag = 0; ++#endif + skb_set_end_offset(skb, size); + skb_set_tail_pointer(skb, skb_headlen(skb)); + skb_headers_offset_update(skb, 0); +@@ -6558,15 +6914,27 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ kmalloc_reserve_size(&size, gfp_mask, NUMA_NO_NODE, NULL); ++ data = pg_skb_alloc(size, gfp_mask, NUMA_NO_NODE); ++ if (!data) ++ return -ENOMEM; ++ size = SKB_WITH_OVERHEAD(PG_ALLOC_SZ(size)); ++#else + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + return -ENOMEM; + size = SKB_WITH_OVERHEAD(size); ++#endif + + memcpy((struct skb_shared_info *)(data + size), + 
skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); + if (skb_orphan_frags(skb, gfp_mask)) { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb_free_frag(data); ++#else + skb_kfree_head(data, size); ++#endif + return -ENOMEM; + } + shinfo = (struct skb_shared_info *)(data + size); +@@ -6602,13 +6970,21 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, + /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ + if (skb_has_frag_list(skb)) + kfree_skb_list(skb_shinfo(skb)->frag_list); ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb_free_frag(data); ++#else + skb_kfree_head(data, size); ++#endif + return -ENOMEM; + } + skb_release_data(skb, SKB_CONSUMED, false); + + skb->head = data; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb->head_frag = 1; ++#else + skb->head_frag = 0; ++#endif + skb->data = data; + skb_set_end_offset(skb, size); + skb_reset_tail_pointer(skb); +diff --git a/net/core/sock.c b/net/core/sock.c +index 9cf404e80..c43642b17 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -284,6 +284,7 @@ EXPORT_SYMBOL(sysctl_rmem_max); + __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; + __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; + int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE; ++EXPORT_SYMBOL(sysctl_mem_pcpu_rsv); + + int sysctl_tstamp_allow_data __read_mostly = 1; + +diff --git a/net/core/stream.c b/net/core/stream.c +index b16dfa568..65ba00d1f 100644 +--- a/net/core/stream.c ++++ b/net/core/stream.c +@@ -83,16 +83,6 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) + } + EXPORT_SYMBOL(sk_stream_wait_connect); + +-/** +- * sk_stream_closing - Return 1 if we still have things to send in our buffers. +- * @sk: socket to verify +- */ +-static int sk_stream_closing(const struct sock *sk) +-{ +- return (1 << READ_ONCE(sk->sk_state)) & +- (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); +-} +- + void sk_stream_wait_close(struct sock *sk, long timeout) + { + if (timeout) { +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index d1492c649..c686ed928 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -1323,6 +1323,14 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, + { + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_TEMPESTA)) { ++ /* Tempesta doesn't use accept queue, just put the request. */ ++ reqsk_put(req); ++ return child; ++ } ++#endif ++ + spin_lock(&queue->rskq_lock); + if (unlikely(sk->sk_state != TCP_LISTEN)) { + inet_child_forget(sk, req, child); +diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c +index 4e470f184..69acde396 100644 +--- a/net/ipv4/inet_hashtables.c ++++ b/net/ipv4/inet_hashtables.c +@@ -1078,7 +1078,8 @@ other_parity_scan: + goto ok; + next_port: + spin_unlock_bh(&head->lock); +- cond_resched(); ++ if (!in_serving_softirq()) ++ cond_resched(); + } + + if (!local_ports) { +diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c +index 67d846622..c54232188 100644 +--- a/net/ipv4/ip_output.c ++++ b/net/ipv4/ip_output.c +@@ -83,6 +83,9 @@ + #include + #include + #include ++#ifdef CONFIG_SECURITY_TEMPESTA ++#include ++#endif + + static int + ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, +@@ -530,6 +533,15 @@ packet_routed: + + /* TODO : should we use skb->sk here instead of sk ? 
*/ + skb->priority = READ_ONCE(sk->sk_priority); ++#ifdef CONFIG_SECURITY_TEMPESTA ++ /* ++ * Tempesta can set skb->mark for some skbs. And moreover ++ * sk_mark is never set for Tempesta sockets. ++ */ ++ if (sock_flag(sk, SOCK_TEMPESTA)) ++ WARN_ON_ONCE(sk->sk_mark); ++ else ++#endif + skb->mark = READ_ONCE(sk->sk_mark); + + res = ip_local_out(net, sk, skb); +@@ -692,7 +704,31 @@ struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state) + } + + /* Allocate buffer */ ++#ifdef CONFIG_SECURITY_TEMPESTA ++ /* ++ * Since Tempesta FW tries to reuse incoming SKBs containing the response ++ * from the backend, sometimes we might encounter an SKB with quite a small ++ * head room, which is not big enough to accommodate all the transport headers ++ * and TLS overhead. ++ * It usually the case when working over loopback, tun/tap, bridge or similar ++ * interfaces with small MTU. The issue is specific to aforementioned ifaces ++ * because the outgoing SKB would be injected back to the stack. ++ * In order not to reallocate sk_buffs' headroom on RX path, ++ * allocate and reserve a little bit more memory on TX path. ++ * Even though it would introduce some memory overhead, it's still ++ * cheaper than doing transformation. ++ * ++ * It seems like no such actions are required for IPv6 counterparts: ++ * ip6_fragment() / ip6_frag_next() due to the fact that the ++ * lowest acceptable MTU (1280) is sufficient to fit all the headers. ++ * ++ * When receiving SKBs from the outter world, the NIC driver should ++ * allocate and reserve all necessary space by itself. ++ */ ++ skb2 = alloc_skb(len + state->hlen + MAX_TCP_HEADER, GFP_ATOMIC); ++#else + skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC); ++#endif + if (!skb2) + return ERR_PTR(-ENOMEM); + +@@ -701,7 +737,11 @@ struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state) + */ + + ip_copy_metadata(skb2, skb); ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb_reserve(skb2, MAX_TCP_HEADER); ++#else + skb_reserve(skb2, state->ll_rs); ++#endif + skb_put(skb2, len + state->hlen); + skb_reset_network_header(skb2); + skb2->transport_header = skb2->network_header + state->hlen; +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 5887eac87..805d2561d 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -459,7 +459,9 @@ void tcp_init_sock(struct sock *sk) + WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); + tcp_scaling_ratio_init(sk); + +- set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); ++ if (sk->sk_socket) ++ set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); ++ + sk_sockets_allocated_inc(sk); + } + EXPORT_SYMBOL(tcp_init_sock); +@@ -653,6 +655,7 @@ void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; + tp->pushed_seq = tp->write_seq; + } ++EXPORT_SYMBOL(tcp_mark_push); + + static inline bool forced_push(const struct tcp_sock *tp) + { +@@ -666,7 +669,15 @@ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) + + tcb->seq = tcb->end_seq = tp->write_seq; + tcb->tcp_flags = TCPHDR_ACK; +- __skb_header_release(skb); ++ ++ /* ++ * fclones are possible here, so accurately update ++ * skb_shinfo(skb)->dataref. 
++ */ ++ BUG_ON(skb->nohdr); ++ skb->nohdr = 1; ++ atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref); ++ + tcp_add_write_queue_tail(sk, skb); + sk_wmem_queued_add(sk, skb->truesize); + sk_mem_charge(sk, skb->truesize); +@@ -675,6 +686,7 @@ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) + + tcp_slow_start_after_idle_check(sk); + } ++EXPORT_SYMBOL(tcp_skb_entail); + + static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) + { +@@ -736,6 +748,7 @@ void tcp_push(struct sock *sk, int flags, int mss_now, + + __tcp_push_pending_frames(sk, mss_now, nonagle); + } ++EXPORT_SYMBOL(tcp_push); + + static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len) +@@ -893,6 +906,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, + } + return NULL; + } ++EXPORT_SYMBOL(tcp_stream_alloc_skb); + + static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, + int large_allowed) +@@ -927,6 +941,7 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags) + + return mss_now; + } ++EXPORT_SYMBOL(tcp_send_mss); + + /* In some cases, sendmsg() could have added an skb to the write queue, + * but failed adding payload on it. We need to remove it to consume less +@@ -1513,6 +1528,7 @@ static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) + } + __kfree_skb(skb); + } ++EXPORT_SYMBOL(tcp_cleanup_rbuf); + + struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) + { +@@ -2682,7 +2698,7 @@ static const unsigned char new_state[16] = { + [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ + }; + +-static int tcp_close_state(struct sock *sk) ++int tcp_close_state(struct sock *sk) + { + int next = (int)new_state[sk->sk_state]; + int ns = next & TCP_STATE_MASK; +@@ -2691,6 +2707,7 @@ static int tcp_close_state(struct sock *sk) + + return next & TCP_ACTION_FIN; + } ++EXPORT_SYMBOL(tcp_close_state); + + /* + * Shutdown the sending side of a connection. Much like close except +@@ -2726,6 +2743,7 @@ int tcp_orphan_count_sum(void) + + return max(total, 0); + } ++EXPORT_SYMBOL(tcp_check_oom); + + static int tcp_orphan_cache; + static struct timer_list tcp_orphan_timer; +@@ -2977,6 +2995,7 @@ void tcp_write_queue_purge(struct sock *sk) + tcp_sk(sk)->packets_out = 0; + inet_csk(sk)->icsk_backoff = 0; + } ++EXPORT_SYMBOL_GPL(tcp_write_queue_purge); + + int tcp_disconnect(struct sock *sk, int flags) + { +@@ -4507,10 +4526,15 @@ void tcp_done(struct sock *sk) + + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); + +- if (!sock_flag(sk, SOCK_DEAD)) ++ if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); +- else ++ } else { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (sk->sk_destroy_cb) ++ sk->sk_destroy_cb(sk); ++#endif + inet_csk_destroy_sock(sk); ++ } + } + EXPORT_SYMBOL_GPL(tcp_done); + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index df7b13f0e..133c90683 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -720,6 +720,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, + tcp_rcv_rtt_update(tp, delta, 0); + } + } ++EXPORT_SYMBOL(tcp_rcv_space_adjust); + + /* + * This function should be called every time data is copied to user space. +@@ -5370,9 +5371,20 @@ restart: + int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); + struct sk_buff *nskb; + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ /* ++ * This skb can be reused by Tempesta FW. Thus allocate ++ * space for TCP headers. 
++ */ ++ nskb = alloc_skb(copy + MAX_TCP_HEADER, GFP_ATOMIC); ++#else + nskb = alloc_skb(copy, GFP_ATOMIC); ++#endif + if (!nskb) + break; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb_reserve(nskb, MAX_TCP_HEADER); ++#endif + + memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); + #ifdef CONFIG_TLS_DEVICE +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index 0c50c5a32..fdc0004ee 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -58,6 +58,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -219,8 +220,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) + return -EAFNOSUPPORT; + + nexthop = daddr = usin->sin_addr.s_addr; +- inet_opt = rcu_dereference_protected(inet->inet_opt, +- lockdep_sock_is_held(sk)); ++ inet_opt = rcu_dereference_raw(inet->inet_opt); + if (inet_opt && inet_opt->opt.srr) { + if (!daddr) + return -EINVAL; +@@ -1292,8 +1292,7 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, + const struct tcp_md5sig_info *md5sig; + + /* caller either holds rcu_read_lock() or socket lock */ +- md5sig = rcu_dereference_check(tp->md5sig_info, +- lockdep_sock_is_held(sk)); ++ md5sig = rcu_dereference_raw(tp->md5sig_info); + if (!md5sig) + return NULL; + #if IS_ENABLED(CONFIG_IPV6) +@@ -1810,6 +1809,18 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, + goto put_and_exit; /* OOM, release back memory */ + #endif + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ /* ++ * We need already initialized socket addresses, ++ * so there is no appropriate security hook. ++ */ ++ if (tempesta_new_clntsk(newsk, skb)) { ++ tcp_v4_send_reset(newsk, skb); ++ tempesta_close_clntsk(newsk); ++ ireq->aborted = true; ++ goto put_and_exit; ++ } ++#endif + if (__inet_inherit_port(sk, newsk) < 0) + goto put_and_exit; + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 0ecc7311d..7fd712ed5 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -867,7 +867,12 @@ listen_overflow: + if (sk != req->rsk_listener) + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) ++ && !inet_rsk(req)->aborted) { ++#else + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) { ++#endif + inet_rsk(req)->acked = 1; + return NULL; + } +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index e3167ad96..dcc382ae6 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -39,6 +39,9 @@ + + #include + #include ++#ifdef CONFIG_SECURITY_TEMPESTA ++#include ++#endif + + #include + #include +@@ -396,7 +399,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + /* Constructs common control bits of non-data skb. If SYN/FIN is present, + * auto increment end seqno. 
+ */ +-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) ++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) + { + skb->ip_summed = CHECKSUM_PARTIAL; + +@@ -409,6 +412,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) + seq++; + TCP_SKB_CB(skb)->end_seq = seq; + } ++EXPORT_SYMBOL(tcp_init_nondata_skb); + + static inline bool tcp_urg_mode(const struct tcp_sock *tp) + { +@@ -1486,7 +1490,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, + * otherwise socket can stall. + */ +-static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) ++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -1497,9 +1501,10 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) + sk_wmem_queued_add(sk, skb->truesize); + sk_mem_charge(sk, skb->truesize); + } ++EXPORT_SYMBOL(tcp_queue_skb); + + /* Initialize TSO segments for a packet. */ +-static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) ++void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) + { + if (skb->len <= mss_now) { + /* Avoid the costly divide in the normal +@@ -1512,11 +1517,12 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) + TCP_SKB_CB(skb)->tcp_gso_size = mss_now; + } + } ++EXPORT_SYMBOL(tcp_set_skb_tso_segs); + + /* Pcount in the middle of the write queue got changed, we need to do various + * tweaks to fix counters + */ +-static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) ++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -1540,6 +1546,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de + + tcp_verify_left_out(tp); + } ++EXPORT_SYMBOL(tcp_adjust_pcount); + + static bool tcp_has_tx_tstamp(const struct sk_buff *skb) + { +@@ -1547,7 +1554,7 @@ static bool tcp_has_tx_tstamp(const struct sk_buff *skb) + (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP); + } + +-static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) ++void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) + { + struct skb_shared_info *shinfo = skb_shinfo(skb); + +@@ -1563,12 +1570,14 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) + TCP_SKB_CB(skb)->txstamp_ack = 0; + } + } ++EXPORT_SYMBOL(tcp_fragment_tstamp); + +-static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) ++void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) + { + TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor; + TCP_SKB_CB(skb)->eor = 0; + } ++EXPORT_SYMBOL(tcp_skb_fragment_eor); + + /* Insert buff after skb on the write or rtx queue of sk. 
*/ + static void tcp_insert_write_queue_after(struct sk_buff *skb, +@@ -1576,12 +1585,39 @@ static void tcp_insert_write_queue_after(struct sk_buff *skb, + struct sock *sk, + enum tcp_queue tcp_queue) + { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb_copy_tfw_cb(buff, skb); ++#endif + if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) + __skb_queue_after(&sk->sk_write_queue, skb, buff); + else + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); + } + ++/** ++ * Tempesta uses page fragments for all skb allocations, so if an skb was ++ * allocated in standard Linux way, then pskb_expand_head( , 0, 0, ) may ++ * return larger skb and we have to adjust skb->truesize and memory accounting ++ * for TCP write queue. ++ */ ++static int ++tcp_skb_unclone(struct sock *sk, struct sk_buff *skb, gfp_t pri) ++{ ++ int r, delta_truesize = skb->truesize; ++ ++ if ((r = skb_unclone(skb, pri))) ++ return r; ++ ++ delta_truesize -= skb->truesize; ++ sk->sk_wmem_queued -= delta_truesize; ++ if (delta_truesize > 0) ++ sk_mem_uncharge(sk, delta_truesize); ++ else ++ sk_mem_charge(sk, -delta_truesize); ++ ++ return 0; ++} ++ + /* Function to create two new TCP segments. Shrinks the given segment + * to the specified size and appends a new segment with the rest of the + * packet to the list. This won't be called frequently, I hope. +@@ -1617,7 +1653,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + return -ENOMEM; + } + +- if (skb_unclone_keeptruesize(skb, gfp)) ++ if (tcp_skb_unclone(sk, skb, gfp)) + return -ENOMEM; + + /* Get a new skb... force flag on. */ +@@ -1632,6 +1668,9 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + nlen = skb->len - len; + buff->truesize += nlen; + skb->truesize -= nlen; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ buff->mark = skb->mark; ++#endif + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; +@@ -1719,7 +1758,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) + { + u32 delta_truesize; + +- if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) ++ if (tcp_skb_unclone(sk, skb, GFP_ATOMIC)) + return -ENOMEM; + + delta_truesize = __pskb_trim_head(skb, len); +@@ -1879,6 +1918,7 @@ unsigned int tcp_current_mss(struct sock *sk) + + return mss_now; + } ++EXPORT_SYMBOL(tcp_current_mss); + + /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, +@@ -2153,8 +2193,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, + * know that all the data is in scatter-gather pages, and that the + * packet has never been sent out before (and thus is not cloned). + */ +-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, +- unsigned int mss_now, gfp_t gfp) ++int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, ++ unsigned int mss_now, gfp_t gfp) + { + int nlen = skb->len - len; + struct sk_buff *buff; +@@ -2173,6 +2213,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, + sk_mem_charge(sk, buff->truesize); + buff->truesize += nlen; + skb->truesize -= nlen; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ buff->mark = skb->mark; ++#endif + + /* Correct the sequence numbers. 
*/ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; +@@ -2199,6 +2242,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, + + return 0; + } ++EXPORT_SYMBOL(tso_fragment); + + /* Try to defer sending, if possible, in order to minimize the amount + * of TSO splitting we do. View it as a kind of TSO Nagle test. +@@ -2345,6 +2389,14 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) + tcp_has_tx_tstamp(skb) || + !skb_pure_zcopy_same(skb, next)) + return false; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ /* Do not coalesce tempesta skbs with tls type or set mark. */ ++ if ((next != ((struct sk_buff *)&(sk)->sk_write_queue)) ++ && ((skb_tfw_tls_type(skb) != skb_tfw_tls_type(next)) ++ || (sock_flag(sk, SOCK_TEMPESTA) ++ && (skb->mark != next->mark)))) ++ return false; ++#endif + + len -= skb->len; + } +@@ -2683,6 +2735,78 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) + tcp_chrono_set(tp, TCP_CHRONO_BUSY); + } + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ ++/** ++ * The next two functions are called from places: from `tcp_write_xmit` ++ * (a usual case) and from `tcp_write_wakeup`. In other places where ++ * `tcp_transmit_skb` is called we deal with special TCP skbs or skbs ++ * not from tcp send queue. ++ */ ++static int ++tcp_tfw_sk_prepare_xmit(struct sock *sk, struct sk_buff *skb, ++ unsigned int mss_now, unsigned int *limit, ++ unsigned int *nskbs) ++{ ++ if (!sk->sk_prepare_xmit || !skb_tfw_tls_type(skb)) ++ return 0; ++ ++ if (unlikely(*limit <= TLS_MAX_OVERHEAD)) { ++ net_warn_ratelimited("%s: too small MSS %u" ++ " for TLS\n", ++ __func__, mss_now); ++ return -ENOMEM; ++ } ++ ++ if (*limit > TLS_MAX_PAYLOAD_SIZE + TLS_MAX_OVERHEAD) ++ *limit = TLS_MAX_PAYLOAD_SIZE; ++ else ++ *limit -= TLS_MAX_OVERHEAD; ++ ++ if (unlikely(skb_tfw_flags(skb) & SS_F_HTTP2_FRAME_PREPARED)) { ++ *nskbs = 1; ++ return 0; ++ } ++ ++ return sk->sk_prepare_xmit(sk, skb, mss_now, limit, nskbs); ++} ++ ++static int ++tcp_tfw_sk_write_xmit(struct sock *sk, struct sk_buff *skb, ++ unsigned int mss_now, unsigned int limit, ++ unsigned int nskbs) ++{ ++ int result; ++ ++ if (!sk->sk_write_xmit || !skb_tfw_tls_type(skb)) ++ return 0; ++ ++ result = sk->sk_write_xmit(sk, skb, mss_now, limit, nskbs); ++ if (unlikely(result)) ++ return result; ++ ++ /* Fix up TSO segments after TLS overhead. */ ++ tcp_set_skb_tso_segs(skb, mss_now); ++ return 0; ++} ++ ++/** ++ * This function is similar to `tcp_write_err` except that we send ++ * TCP RST to remote peer. We call this function when an error occurs ++ * while sending data from which we cannot recover, so we close the ++ * connection with TCP RST. ++ */ ++static void ++tcp_tfw_handle_error(struct sock *sk, int error) ++{ ++ tcp_send_active_reset(sk, GFP_ATOMIC); ++ sk->sk_err = error; ++ sk->sk_error_report(sk); ++ tcp_write_queue_purge(sk); ++ tcp_done(sk); ++} ++#endif ++ + /* This routine writes packets to the network. It advances the + * send_head. This happens as incoming acks open up the remote + * window for us. 
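
Illustration only, not from the patch series: the hunk above adds tcp_tfw_sk_prepare_xmit(), whose role is to shrink the TSO limit handed to tso_fragment() so that the TLS record framing later appended by sk->sk_write_xmit() still fits into what TCP is allowed to transmit. Restated in isolation, the clamping behaves like the sketch below; tls_clamp_xmit_limit() is a hypothetical helper name, and the two constants are illustrative stand-ins for the real Tempesta TLS definitions of TLS_MAX_PAYLOAD_SIZE and TLS_MAX_OVERHEAD.

#define TLS_MAX_PAYLOAD_SIZE	16384	/* assumed: max plaintext per TLS record */
#define TLS_MAX_OVERHEAD	29	/* assumed: record header + explicit IV + tag */

/* Mirrors the limit clamping in tcp_tfw_sk_prepare_xmit() above. */
static unsigned int
tls_clamp_xmit_limit(unsigned int limit)
{
	/* The caller already returned -ENOMEM if limit <= TLS_MAX_OVERHEAD. */
	if (limit > TLS_MAX_PAYLOAD_SIZE + TLS_MAX_OVERHEAD)
		return TLS_MAX_PAYLOAD_SIZE;	/* a full record plus overhead fits */
	return limit - TLS_MAX_OVERHEAD;	/* otherwise reserve room for framing */
}

With this invariant, the skb length checked against `limit` in tcp_write_xmit() is a plaintext budget, and the TSO segment count is recomputed after encryption via tcp_set_skb_tso_segs() in tcp_tfw_sk_write_xmit().
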
+@@ -2707,6 +2831,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int result; + bool is_cwnd_limited = false, is_rwnd_limited = false; + u32 max_segs; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ unsigned int nskbs = UINT_MAX; ++#endif + + sent_pkts = 0; + +@@ -2773,7 +2900,16 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + cwnd_quota, + max_segs), + nonagle); +- ++#ifdef CONFIG_SECURITY_TEMPESTA ++ result = tcp_tfw_sk_prepare_xmit(sk, skb, mss_now, &limit, ++ &nskbs); ++ if (unlikely(result)) { ++ if (result == -ENOMEM) ++ break; /* try again next time */ ++ tcp_tfw_handle_error(sk, result); ++ return false; ++ } ++#endif + if (skb->len > limit && + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + break; +@@ -2788,7 +2924,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + */ + if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) + break; +- ++#ifdef CONFIG_SECURITY_TEMPESTA ++ result = tcp_tfw_sk_write_xmit(sk, skb, mss_now, limit, nskbs); ++ if (unlikely(result)) { ++ if (result == -ENOMEM) ++ break; /* try again next time */ ++ tcp_tfw_handle_error(sk, result); ++ return false; ++ } ++#endif + if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) + break; + +@@ -2978,6 +3122,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, + sk_gfp_mask(sk, GFP_ATOMIC))) + tcp_check_probe_timer(sk); + } ++EXPORT_SYMBOL(__tcp_push_pending_frames); + + /* Send _single_ skb sitting at the send head. This function requires + * true push pending frames to setup probe timer etc. +@@ -3336,7 +3481,7 @@ start: + cur_mss, GFP_ATOMIC)) + return -ENOMEM; /* We'll try again later. */ + } else { +- if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) ++ if (tcp_skb_unclone(sk, skb, GFP_ATOMIC)) + return -ENOMEM; + + diff = tcp_skb_pcount(skb); +@@ -3577,6 +3722,7 @@ void tcp_send_fin(struct sock *sk) + } + __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); + } ++EXPORT_SYMBOL(tcp_send_fin); + + /* We get here when a process closes a file descriptor (either due to + * an explicit close() or as a byproduct of exit()'ing) and there +@@ -3610,6 +3756,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) + */ + trace_tcp_send_reset(sk, NULL); + } ++EXPORT_SYMBOL(tcp_send_active_reset); + + /* Send a crossed SYN-ACK during socket establishment. 
+ * WARNING: This routine must only be called when we have already sent +@@ -4292,6 +4439,9 @@ int tcp_write_wakeup(struct sock *sk, int mib) + + skb = tcp_send_head(sk); + if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ unsigned int nskbs = UINT_MAX; ++#endif + int err; + unsigned int mss = tcp_current_mss(sk); + unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; +@@ -4299,6 +4449,15 @@ int tcp_write_wakeup(struct sock *sk, int mib) + if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) + tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ err = tcp_tfw_sk_prepare_xmit(sk, skb, mss, &seg_size, &nskbs); ++ if (unlikely(err)) { ++ if (err != -ENOMEM) ++ tcp_tfw_handle_error(sk, err); ++ return err; ++ } ++#endif ++ + /* We are probing the opening of a window + * but the window size is != 0 + * must have been a result SWS avoidance ( sender ) +@@ -4314,6 +4473,16 @@ int tcp_write_wakeup(struct sock *sk, int mib) + tcp_set_skb_tso_segs(skb, mss); + + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; ++ ++#ifdef CONFIG_SECURITY_TEMPESTA ++ err = tcp_tfw_sk_write_xmit(sk, skb, mss, seg_size, nskbs); ++ if (unlikely(err)) { ++ if (err != -ENOMEM) ++ tcp_tfw_handle_error(sk, err); ++ return err; ++ } ++#endif ++ + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + if (!err) + tcp_event_new_data_sent(sk, skb); +diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c +index 57b25b1fc..5fcfe7199 100644 +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -65,6 +65,7 @@ + + #include + #include ++#include + + #include + +@@ -563,7 +564,11 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, + + rcu_read_lock(); + opt = ireq->ipv6_opt; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (!sock_flag(sk, SOCK_TEMPESTA) && !opt) ++#else + if (!opt) ++#endif + opt = rcu_dereference(np->opt); + err = ip6_xmit(sk, skb, fl6, skb->mark ? : READ_ONCE(sk->sk_mark), + opt, tclass, READ_ONCE(sk->sk_priority)); +@@ -1488,7 +1493,11 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * + to newsk. + */ + opt = ireq->ipv6_opt; ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (!sock_flag(sk, SOCK_TEMPESTA) && !opt) ++#else + if (!opt) ++#endif + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(newsk, opt); +@@ -1532,7 +1541,20 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * + if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6)) + goto out; /* OOM */ + #endif +- ++#ifdef CONFIG_SECURITY_TEMPESTA ++ /* ++ * We need already initialized socket addresses, ++ * so there is no appropriate security hook. ++ */ ++ if (tempesta_new_clntsk(newsk, skb)) { ++ tcp_v6_send_reset(newsk, skb); ++ tempesta_close_clntsk(newsk); ++ ireq->aborted = true; ++ inet_csk_prepare_forced_close(newsk); ++ tcp_done(newsk); ++ goto out; ++ } ++#endif + if (__inet_inherit_port(sk, newsk) < 0) { + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); +diff --git a/net/socket.c b/net/socket.c +index ed3df2f74..abf94e5e8 100644 +--- a/net/socket.c ++++ b/net/socket.c +@@ -227,6 +227,12 @@ static const char * const pf_family_names[] = { + static DEFINE_SPINLOCK(net_family_lock); + static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly; + ++const struct net_proto_family *get_proto_family(int family) ++{ ++ return rcu_dereference_bh(net_families[family]); ++} ++EXPORT_SYMBOL(get_proto_family); ++ + /* + * Support routines. 
+ * Move socket addresses back and forth across the kernel/user +diff --git a/security/Kconfig b/security/Kconfig +index 52c9af08a..8cdb093d7 100644 +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -193,12 +193,14 @@ source "security/loadpin/Kconfig" + source "security/yama/Kconfig" + source "security/safesetid/Kconfig" + source "security/lockdown/Kconfig" ++source "security/tempesta/Kconfig" + source "security/landlock/Kconfig" + + source "security/integrity/Kconfig" + + choice + prompt "First legacy 'major LSM' to be initialized" ++ default DEFAULT_SECURITY_TEMPESTA if SECURITY_TEMPESTA + default DEFAULT_SECURITY_SELINUX if SECURITY_SELINUX + default DEFAULT_SECURITY_SMACK if SECURITY_SMACK + default DEFAULT_SECURITY_TOMOYO if SECURITY_TOMOYO +@@ -214,6 +216,9 @@ choice + Selects the legacy "major security module" that will be + initialized first. Overridden by non-default CONFIG_LSM. + ++ config DEFAULT_SECURITY_TEMPESTA ++ bool "Tempesta FW" if SECURITY_TEMPESTA=y ++ + config DEFAULT_SECURITY_SELINUX + bool "SELinux" if SECURITY_SELINUX=y + +@@ -238,6 +243,7 @@ config LSM + default "landlock,lockdown,yama,loadpin,safesetid,tomoyo,bpf" if DEFAULT_SECURITY_TOMOYO + default "landlock,lockdown,yama,loadpin,safesetid,bpf" if DEFAULT_SECURITY_DAC + default "landlock,lockdown,yama,loadpin,safesetid,selinux,smack,tomoyo,apparmor,bpf" ++ default "tempesta,landlock,lockdown,yama,loadpin,safesetid,selinux,smack,tomoyo,apparmor,bpf" + help + A comma-separated list of LSMs, in initialization order. + Any LSMs left off this list, except for those with order +diff --git a/security/Makefile b/security/Makefile +index 59f238490..9ccefb552 100644 +--- a/security/Makefile ++++ b/security/Makefile +@@ -24,6 +24,7 @@ obj-$(CONFIG_SECURITY_SAFESETID) += safesetid/ + obj-$(CONFIG_SECURITY_LOCKDOWN_LSM) += lockdown/ + obj-$(CONFIG_CGROUPS) += device_cgroup.o + obj-$(CONFIG_BPF_LSM) += bpf/ ++obj-$(CONFIG_SECURITY_TEMPESTA) += tempesta/ + obj-$(CONFIG_SECURITY_LANDLOCK) += landlock/ + + # Object integrity file lists +diff --git a/security/security.c b/security/security.c +index a344b8fa5..126a7fdf4 100644 +--- a/security/security.c ++++ b/security/security.c +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + + /* How many LSMs were built into the kernel? */ + #define LSM_COUNT (__end_lsm_info - __start_lsm_info) +@@ -4695,6 +4696,8 @@ EXPORT_SYMBOL(security_socket_getpeersec_dgram); + */ + int security_sk_alloc(struct sock *sk, int family, gfp_t priority) + { ++ sk->sk_security = NULL; ++ + return call_int_hook(sk_alloc_security, 0, sk, family, priority); + } + +diff --git a/security/tempesta/Kconfig b/security/tempesta/Kconfig +new file mode 100644 +index 000000000..f6be0927a +--- /dev/null ++++ b/security/tempesta/Kconfig +@@ -0,0 +1,16 @@ ++config SECURITY_TEMPESTA ++ bool "Tempesta FW Support" ++ depends on SECURITY && NET && INET ++ select SECURITY_NETWORK ++ select RPS ++ select CRYPTO ++ select CRYPTO_HMAC ++ select CRYPTO_SHA1 ++ select CRYPTO_SHA1_SSSE3 ++ select CRYPTO_GCM ++ select CRYPTO_CCM ++ default y ++ help ++ This selects Tempesta FW security module. ++ Further information may be found at https://github.com/natsys/tempesta ++ If you are unsure how to answer this question, answer N. 
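
Illustration only, not from the patch series: the Kconfig/Makefile wiring above and the LSM shim added below in security/tempesta/tempesta_lsm.c only expose a registration API; the Tempesta FW module itself is expected to plug in a TempestaOps callback table at load time. A minimal sketch of that usage follows. The callback field names (sk_alloc, sk_free, sock_tcp_rcv) match how tempesta_lsm.c invokes them; the header name and the handler bodies are hypothetical placeholders.

#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/tempesta.h>	/* assumed header providing TempestaOps */

/* Placeholder handlers: classify a new client socket, drop its state,
 * and filter ingress TCP data. The real logic lives in Tempesta FW. */
static int tfw_sk_alloc_hook(struct sock *sk, struct sk_buff *skb)
{
	return 0;
}

static void tfw_sk_free_hook(struct sock *sk)
{
}

static int tfw_tcp_rcv_hook(struct sock *sk, struct sk_buff *skb)
{
	return 0;
}

static TempestaOps tfw_example_ops = {
	.sk_alloc	= tfw_sk_alloc_hook,
	.sk_free	= tfw_sk_free_hook,
	.sock_tcp_rcv	= tfw_tcp_rcv_hook,
};

static int __init tfw_example_init(void)
{
	tempesta_register_ops(&tfw_example_ops);
	return 0;
}

static void __exit tfw_example_exit(void)
{
	tempesta_unregister_ops(&tfw_example_ops);
}

module_init(tfw_example_init);
module_exit(tfw_example_exit);
MODULE_LICENSE("GPL");

Note that tempesta_register_ops() below BUG()s on double registration, and tempesta_unregister_ops() ends with synchronize_rcu(), so a module following this pattern may free any state reachable through its TempestaOps table immediately after unregistering.
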
+diff --git a/security/tempesta/Makefile b/security/tempesta/Makefile +new file mode 100644 +index 000000000..4c439ac0c +--- /dev/null ++++ b/security/tempesta/Makefile +@@ -0,0 +1,3 @@ ++obj-y := tempesta.o ++ ++tempesta-y := tempesta_lsm.o +diff --git a/security/tempesta/tempesta_lsm.c b/security/tempesta/tempesta_lsm.c +new file mode 100644 +index 000000000..1303d8273 +--- /dev/null ++++ b/security/tempesta/tempesta_lsm.c +@@ -0,0 +1,140 @@ ++/** ++ * Tempesta FW ++ * ++ * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). ++ * Copyright (C) 2015-2023 Tempesta Technologies, Inc. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, ++ * or (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ * FOR A PARTICULAR PURPOSE. ++ * See the GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along with ++ * this program; if not, write to the Free Software Foundation, Inc., 59 ++ * Temple Place - Suite 330, Boston, MA 02111-1307, USA. ++ */ ++#include ++#include ++#include ++#include ++ ++static TempestaOps __rcu *tempesta_ops = NULL; ++static DEFINE_SPINLOCK(tops_lock); ++ ++void ++tempesta_register_ops(TempestaOps *tops) ++{ ++ spin_lock(&tops_lock); ++ ++ BUG_ON(tempesta_ops); ++ ++ rcu_assign_pointer(tempesta_ops, tops); ++ ++ spin_unlock(&tops_lock); ++} ++EXPORT_SYMBOL(tempesta_register_ops); ++ ++void ++tempesta_unregister_ops(TempestaOps *tops) ++{ ++ spin_lock(&tops_lock); ++ ++ BUG_ON(tempesta_ops != tops); ++ ++ rcu_assign_pointer(tempesta_ops, NULL); ++ ++ spin_unlock(&tops_lock); ++ ++ /* ++ * tempesta_ops is called in softirq only, so if there are some users ++ * of the structures then they are active on their CPUs. ++ * After the below we can be sure that nobody refers @tops and we can ++ * go forward and destroy it. 
++ */ ++ synchronize_rcu(); ++} ++EXPORT_SYMBOL(tempesta_unregister_ops); ++ ++int ++tempesta_new_clntsk(struct sock *newsk, struct sk_buff *skb) ++{ ++ int r = 0; ++ ++ TempestaOps *tops; ++ ++ WARN_ON(newsk->sk_security); ++ ++ rcu_read_lock(); ++ ++ tops = rcu_dereference(tempesta_ops); ++ if (likely(tops)) ++ r = tops->sk_alloc(newsk, skb); ++ ++ rcu_read_unlock(); ++ ++ return r; ++} ++EXPORT_SYMBOL(tempesta_new_clntsk); ++ ++void ++tempesta_close_clntsk(struct sock *sk) ++{ ++ TempestaOps *tops; ++ ++ rcu_read_lock(); ++ ++ tops = rcu_dereference(tempesta_ops); ++ if (likely(tops)) ++ tops->sk_free(sk); ++ ++ rcu_read_unlock(); ++} ++EXPORT_SYMBOL(tempesta_close_clntsk); ++ ++static int ++tempesta_sock_tcp_rcv(struct sock *sk, struct sk_buff *skb) ++{ ++ int r = 0; ++ TempestaOps *tops; ++ ++ rcu_read_lock(); ++ ++ tops = rcu_dereference(tempesta_ops); ++ if (likely(tops)) { ++ if (skb->protocol == htons(ETH_P_IP)) ++ r = tops->sock_tcp_rcv(sk, skb); ++ } ++ ++ rcu_read_unlock(); ++ ++ return r; ++} ++ ++static struct security_hook_list tempesta_hooks[] __read_mostly = { ++ LSM_HOOK_INIT(socket_sock_rcv_skb, tempesta_sock_tcp_rcv), ++}; ++ ++static const struct lsm_id tempesta_lsmid = { ++ .name = "tempesta", ++ .id = LSM_ID_BPF, ++}; ++ ++static __init int ++tempesta_init(void) ++{ ++ security_add_hooks(tempesta_hooks, ARRAY_SIZE(tempesta_hooks), ++ &tempesta_lsmid); ++ ++ return 0; ++} ++ ++DEFINE_LSM(smack) = { ++ .name = "tempesta", ++ .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE, ++ .init = tempesta_init, ++}; From 99bb027c5994224a7e99d3f4ab674bc000eafa6a Mon Sep 17 00:00:00 2001 From: kingluo Date: Wed, 26 Jun 2024 16:16:07 +0800 Subject: [PATCH 16/25] patch update: change for_each_possible_cpu to for_each_online_cpu --- linux-6.8.9.patch | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/linux-6.8.9.patch b/linux-6.8.9.patch index dcf8c282b..a339f9bc1 100644 --- a/linux-6.8.9.patch +++ b/linux-6.8.9.patch @@ -748,7 +748,7 @@ index 000000000..90eedcba5 + * Linux interface for Tempesta FW. + * + * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). -+ * Copyright (C) 2015-2023 Tempesta Technologies, Inc. ++ * Copyright (C) 2015-2024 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by @@ -1065,7 +1065,7 @@ index 000000000..7ee3ead54 +/** + * Tempesta Memory Reservation + * -+ * Copyright (C) 2015-2022 Tempesta Technologies, Inc. ++ * Copyright (C) 2015-2024 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by @@ -1410,7 +1410,7 @@ index 63de5c635..76e222130 100644 } +EXPORT_SYMBOL(reqsk_fastopen_remove); diff --git a/net/core/skbuff.c b/net/core/skbuff.c -index 71dee435d..c6b29cfc3 100644 +index 71dee435d..fe4b6828c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -89,7 +89,9 @@ @@ -1919,7 +1919,7 @@ index 71dee435d..c6b29cfc3 100644 { +#ifdef CONFIG_SECURITY_TEMPESTA + int cpu, l; -+ for_each_possible_cpu(cpu) ++ for_each_online_cpu(cpu) + for (l = 0; l < PG_LISTS_N; ++l) { + TfwSkbMemPool *pool = per_cpu_ptr(&pg_mpool[l], cpu); + INIT_LIST_HEAD(&pool->lh); @@ -3037,7 +3037,7 @@ index 000000000..1303d8273 + * Tempesta FW + * + * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). -+ * Copyright (C) 2015-2023 Tempesta Technologies, Inc. 
++ * Copyright (C) 2015-2024 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by From c1c068b65831190944ebb96f0010b0d7611a075a Mon Sep 17 00:00:00 2001 From: kingluo Date: Thu, 4 Jul 2024 14:58:44 +0800 Subject: [PATCH 17/25] handle SKBFL_SHARED_FRAG in flags, not tx_flags --- fw/cache.c | 6 +++--- fw/http.c | 4 ++-- fw/http_msg.c | 12 ++++++------ fw/http_msg.h | 4 ++-- fw/msg.c | 8 ++++---- fw/msg.h | 4 ++-- fw/ss_skb.c | 16 ++++++++-------- fw/ss_skb.h | 4 ++-- fw/tcp.c | 4 ++-- 9 files changed, 31 insertions(+), 31 deletions(-) diff --git a/fw/cache.c b/fw/cache.c index 561efcc67..b6ea82441 100644 --- a/fw/cache.c +++ b/fw/cache.c @@ -2638,14 +2638,14 @@ tfw_cache_build_resp_body(TDB *db, TdbVRec *trec, TfwMsgIter *it, char *p, * TX flags for headers and body differ. */ if (!it->skb || (it->frag + 1 >= MAX_SKB_FRAGS) - || (sh_frag == !(skb_shinfo(it->skb)->tx_flags & SKBFL_SHARED_FRAG))) + || (sh_frag == !(skb_shinfo(it->skb)->flags & SKBFL_SHARED_FRAG))) { if ((r = tfw_msg_iter_append_skb(it))) return r; if (!sh_frag) - skb_shinfo(it->skb)->tx_flags &= ~SKBFL_SHARED_FRAG; + skb_shinfo(it->skb)->flags &= ~SKBFL_SHARED_FRAG; else - skb_shinfo(it->skb)->tx_flags |= SKBFL_SHARED_FRAG; + skb_shinfo(it->skb)->flags |= SKBFL_SHARED_FRAG; } while (1) { diff --git a/fw/http.c b/fw/http.c index fd9599328..8f0e138cb 100644 --- a/fw/http.c +++ b/fw/http.c @@ -4816,11 +4816,11 @@ tfw_h2_append_predefined_body(TfwHttpResp *resp, unsigned int stream_id, it->frag = skb_shinfo(it->skb)->nr_frags - 1; if ((it->frag + 1 >= MAX_SKB_FRAGS) - || (skb_shinfo(it->skb)->tx_flags & SKBFL_SHARED_FRAG)) + || (skb_shinfo(it->skb)->flags & SKBFL_SHARED_FRAG)) { if ((r = tfw_msg_iter_append_skb(it))) return r; - skb_shinfo(it->skb)->tx_flags &= ~SKBFL_SHARED_FRAG; + skb_shinfo(it->skb)->flags &= ~SKBFL_SHARED_FRAG; } data = body->data; diff --git a/fw/http_msg.c b/fw/http_msg.c index e9a119e43..aea94d5ec 100644 --- a/fw/http_msg.c +++ b/fw/http_msg.c @@ -4,7 +4,7 @@ * HTTP message manipulation helpers for the protocol processing. * * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2023 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. 
* * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -1113,11 +1113,11 @@ tfw_http_msg_hdr_add(TfwHttpMsg *hm, const TfwStr *hdr) */ int tfw_http_msg_setup(TfwHttpMsg *hm, TfwMsgIter *it, size_t data_len, - unsigned int tx_flags) + unsigned int flags) { int r; - if ((r = tfw_msg_iter_setup(it, &hm->msg.skb_head, data_len, tx_flags))) + if ((r = tfw_msg_iter_setup(it, &hm->msg.skb_head, data_len, flags))) return r; T_DBG2("Set up HTTP message %pK with %lu bytes data\n", hm, data_len); @@ -1487,7 +1487,7 @@ __tfw_http_msg_linear_transform(TfwMsgIter *it) if (!nskb) return -ENOMEM; - skb_shinfo(nskb)->tx_flags = skb_shinfo(it->skb)->tx_flags; + skb_shinfo(nskb)->flags = skb_shinfo(it->skb)->flags; ss_skb_insert_before(&it->skb_head, it->skb, nskb); it->skb = nskb; it->frag = -1; @@ -1565,8 +1565,8 @@ __tfw_http_msg_expand_from_pool(TfwHttpResp *resp, const TfwStr *str, } } - skb_shinfo(nskb)->tx_flags = - skb_shinfo(it->skb)->tx_flags; + skb_shinfo(nskb)->flags = + skb_shinfo(it->skb)->flags; ss_skb_insert_after(it->skb, nskb); /* * If body is located in the zero fragment and diff --git a/fw/http_msg.h b/fw/http_msg.h index 1fcb02883..619d7dd82 100644 --- a/fw/http_msg.h +++ b/fw/http_msg.h @@ -2,7 +2,7 @@ * Tempesta FW * * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2023 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -162,7 +162,7 @@ int tfw_http_msg_del_hbh_hdrs(TfwHttpMsg *hm); int tfw_http_msg_cutoff_body_chunks(TfwHttpResp *resp); int tfw_http_msg_setup(TfwHttpMsg *hm, TfwMsgIter *it, size_t data_len, - unsigned int tx_flags); + unsigned int flags); int tfw_http_msg_add_data(TfwMsgIter *it, TfwHttpMsg *hm, TfwStr *field, const TfwStr *data); void tfw_http_msg_hdr_open(TfwHttpMsg *hm, unsigned char *hdr_start); diff --git a/fw/msg.c b/fw/msg.c index 90f035dab..061d51163 100644 --- a/fw/msg.c +++ b/fw/msg.c @@ -1,7 +1,7 @@ /** * Tempesta FW * - * Copyright (C) 2018-2023 Tempesta Technologies, Inc. + * Copyright (C) 2018-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -47,11 +47,11 @@ EXPORT_SYMBOL(tfw_msg_write); */ int tfw_msg_iter_setup(TfwMsgIter *it, struct sk_buff **skb_head, size_t data_len, - unsigned int tx_flags) + unsigned int flags) { int r; - if ((r = ss_skb_alloc_data(skb_head, data_len, tx_flags))) + if ((r = ss_skb_alloc_data(skb_head, data_len, flags))) return r; it->skb = it->skb_head = *skb_head; it->frag = -1; @@ -76,7 +76,7 @@ tfw_msg_iter_append_skb(TfwMsgIter *it) it->skb = ss_skb_peek_tail(&it->skb_head); it->frag = -1; - skb_shinfo(it->skb)->tx_flags = skb_shinfo(it->skb->prev)->tx_flags; + skb_shinfo(it->skb)->flags = skb_shinfo(it->skb->prev)->flags; return 0; } diff --git a/fw/msg.h b/fw/msg.h index bb604456d..da008675a 100644 --- a/fw/msg.h +++ b/fw/msg.h @@ -4,7 +4,7 @@ * Generic protocol message. * * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2023 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. 
* * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -97,7 +97,7 @@ typedef struct { int tfw_msg_write(TfwMsgIter *it, const TfwStr *data); int tfw_msg_iter_setup(TfwMsgIter *it, struct sk_buff **skb_head, - size_t data_len, unsigned int tx_flags); + size_t data_len, unsigned int flags); int tfw_msg_iter_append_skb(TfwMsgIter *it); int tfw_http_iter_set_at(TfwMsgIter *it, char *off); char *tfw_http_iter_set_at_skb(TfwMsgIter *it, struct sk_buff *skb, diff --git a/fw/ss_skb.c b/fw/ss_skb.c index 2ef58d0fe..24a3c5f84 100644 --- a/fw/ss_skb.c +++ b/fw/ss_skb.c @@ -7,7 +7,7 @@ * on top on native Linux socket buffers. The helpers provide common and * convenient wrappers for skb processing. * - * Copyright (C) 2015-2023 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -109,7 +109,7 @@ ss_skb_alloc_pages(size_t len) * segmentation. The allocated payload space will be filled with data. */ int -ss_skb_alloc_data(struct sk_buff **skb_head, size_t len, unsigned int tx_flags) +ss_skb_alloc_data(struct sk_buff **skb_head, size_t len, unsigned int flags) { int i_skb, nr_skbs = len ? DIV_ROUND_UP(len, SS_SKB_MAX_DATA_LEN) : 1; size_t n = 0; @@ -120,7 +120,7 @@ ss_skb_alloc_data(struct sk_buff **skb_head, size_t len, unsigned int tx_flags) skb = ss_skb_alloc_pages(n); if (!skb) return -ENOMEM; - skb_shinfo(skb)->tx_flags |= tx_flags; + skb_shinfo(skb)->flags |= flags; ss_skb_queue_tail(skb_head, skb); } @@ -224,7 +224,7 @@ __extend_pgfrags(struct sk_buff *skb_head, struct sk_buff *skb, int from, int n) nskb = ss_skb_alloc(0); if (nskb == NULL) return -ENOMEM; - skb_shinfo(nskb)->tx_flags = skb_shinfo(skb)->tx_flags; + skb_shinfo(nskb)->flags = skb_shinfo(skb)->flags; ss_skb_insert_after(skb, nskb); skb_shinfo(nskb)->nr_frags = n_excess; } @@ -1331,7 +1331,7 @@ ss_skb_init_for_xmit(struct sk_buff *skb) skb->mac_header = (typeof(skb->mac_header))~0U; skb->transport_header = (typeof(skb->transport_header))~0U; - shinfo->tx_flags = 0; + shinfo->flags = 0; shinfo->gso_size = 0; shinfo->gso_segs = 0; shinfo->gso_type = 0; @@ -1354,7 +1354,7 @@ __coalesce_frag(struct sk_buff **skb_head, skb_frag_t *frag, skb = ss_skb_alloc(0); if (!skb) return -ENOMEM; - skb_shinfo(skb)->tx_flags = skb_shinfo(orig_skb)->tx_flags; + skb_shinfo(skb)->flags = skb_shinfo(orig_skb)->flags; ss_skb_queue_tail(skb_head, skb); skb->mark = orig_skb->mark; } @@ -1569,7 +1569,7 @@ ss_skb_to_sgvec_with_new_pages(struct sk_buff *skb, struct scatterlist *sgl, int i; /* TODO: process of SKBTX_ZEROCOPY_FRAG for MSG_ZEROCOPY */ - if (skb_shinfo(skb)->tx_flags & SKBFL_SHARED_FRAG) { + if (skb_shinfo(skb)->flags & SKBFL_ALL_ZEROCOPY) { if (head_data_len) { sg_set_buf(sgl + out_frags, skb->data, head_data_len); out_frags++; @@ -1605,7 +1605,7 @@ ss_skb_to_sgvec_with_new_pages(struct sk_buff *skb, struct scatterlist *sgl, } if (out_frags > 0) sg_mark_end(&sgl[out_frags - 1]); - skb_shinfo(skb)->tx_flags &= ~SKBFL_SHARED_FRAG; + skb_shinfo(skb)->flags &= ~SKBFL_SHARED_FRAG; } else { int r = skb_to_sgvec(skb, sgl + out_frags, 0, skb->len); if (r <= 0) diff --git a/fw/ss_skb.h b/fw/ss_skb.h index cb07c90ef..bf2e3d3cf 100644 --- a/fw/ss_skb.h +++ b/fw/ss_skb.h @@ -3,7 +3,7 @@ * * Synchronous Sockets API for Linux socket buffers manipulation. 
* - * Copyright (C) 2015-2023 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -299,7 +299,7 @@ ss_skb_move_frags(struct sk_buff *skb, struct sk_buff *nskb, int from, char *ss_skb_fmt_src_addr(const struct sk_buff *skb, char *out_buf); int ss_skb_alloc_data(struct sk_buff **skb_head, size_t len, - unsigned int tx_flags); + unsigned int flags); struct sk_buff *ss_skb_split(struct sk_buff *skb, int len); int ss_skb_get_room(struct sk_buff *skb_head, struct sk_buff *skb, char *pspt, unsigned int len, TfwStr *it); diff --git a/fw/tcp.c b/fw/tcp.c index 77bdd83bc..3a42376b4 100644 --- a/fw/tcp.c +++ b/fw/tcp.c @@ -1,7 +1,7 @@ /** * Tempesta FW * - * Copyright (C) 2015-2023 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -68,7 +68,7 @@ tfw_tcp_setup_new_skb(struct sock *sk, struct sk_buff *skb, const bool tcp_fragment = skb->len != skb->data_len; INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); - skb_shinfo(nskb)->tx_flags = 0; + skb_shinfo(nskb)->flags = 0; memset(TCP_SKB_CB(nskb), 0, sizeof(struct tcp_skb_cb)); /* PSH and FIN should only be set in the second packet. */ From 96628acb9da155dca75177915d0d633bf67560ca Mon Sep 17 00:00:00 2001 From: kingluo Date: Thu, 4 Jul 2024 19:17:07 +0800 Subject: [PATCH 18/25] merge master branch: 0dee0257 --- Makefile | 4 +- db/core/htrie.c | 4 +- fw/cache.c | 152 ++++-- fw/cfg.h | 9 +- fw/connection.h | 6 +- fw/hpack.c | 248 ++++----- fw/hpack.h | 26 +- fw/http.c | 263 +++++----- fw/http.h | 16 +- fw/http2.c | 556 +++++++++++++++++++++ fw/http2.h | 167 +++++++ fw/http_frame.c | 1054 ++++++++++++++++++--------------------- fw/http_frame.h | 172 +------ fw/http_msg.c | 20 +- fw/http_msg.h | 9 +- fw/http_sess.c | 10 +- fw/http_sess.h | 3 +- fw/http_stream.c | 308 +++++++----- fw/http_stream.h | 169 +++++-- fw/http_stream_sched.c | 616 +++++++++++++++++++++++ fw/http_stream_sched.h | 95 ++++ fw/http_types.h | 7 +- fw/main.c | 7 +- fw/msg.c | 36 -- fw/msg.h | 18 +- fw/sock.c | 204 +++++--- fw/sock_clnt.c | 315 +----------- fw/ss_skb.c | 11 +- fw/ss_skb.h | 137 +++++ fw/sync_socket.h | 25 +- fw/t/unit/helpers.c | 15 + fw/t/unit/http2.c | 556 +++++++++++++++++++++ fw/t/unit/test.c | 6 +- fw/t/unit/test_ebtree.c | 52 ++ fw/t/unit/test_hpack.c | 3 +- fw/tcp.h | 39 +- fw/tls.c | 64 ++- fw/tls.h | 4 +- fw/work_queue.c | 4 +- ktest/linux/percpu.h | 4 +- lib/Makefile | 4 +- lib/eb64tree.c | 35 ++ lib/eb64tree.h | 273 ++++++++++ lib/ebtree.c | 40 ++ lib/ebtree.h | 597 ++++++++++++++++++++++ linux-5.10.35.patch | 421 +++++++--------- tls/mpool.c | 6 +- tls/rsa.c | 4 +- tls/ttls.c | 6 +- 49 files changed, 4858 insertions(+), 1942 deletions(-) create mode 100644 fw/http2.c create mode 100644 fw/http2.h create mode 100644 fw/http_stream_sched.c create mode 100644 fw/http_stream_sched.h create mode 100644 fw/t/unit/http2.c create mode 100644 fw/t/unit/test_ebtree.c create mode 100755 lib/eb64tree.c create mode 100755 lib/eb64tree.h create mode 100755 lib/ebtree.c create mode 100755 lib/ebtree.h diff --git a/Makefile b/Makefile index 82428c5e2..c23a2aae4 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Tempesta FW # # Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). 
-# Copyright (C) 2015-2022 Tempesta Technologies, Inc. +# Copyright (C) 2015-2024 Tempesta Technologies, Inc. # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -73,6 +73,7 @@ DBG_WS ?= 0 DBG_APM ?= 0 DBG_GFSM ?= 0 DBG_HTTP ?= 0 +DBG_HTTP2 ?= 0 DBG_HTTP_FRAME ?= 0 DBG_HTTP_SESS ?= 0 DBG_HTTP_STREAM ?= 0 @@ -89,6 +90,7 @@ TFW_CFLAGS += -DDBG_HTTP_SESS=$(DBG_HTTP_SESS) TFW_CFLAGS += -DDBG_HTTP_STREAM=$(DBG_HTTP_STREAM) TFW_CFLAGS += -DDBG_HPACK=$(DBG_HPACK) -DDBG_CACHE=$(DBG_CACHE) TFW_CFLAGS += -DDBG_SRV=$(DBG_SRV) -DDBG_VHOST=$(DBG_VHOST) -DDBG_TEST=$(DBG_TEST) +TFW_CFLAGS += -DDBG_HTTP2=$(DBG_HTTP2) # By default Tempesta TLS randomizes elliptic curve points using RDRAND # instruction, which provides a high speed random numbers generator. diff --git a/db/core/htrie.c b/db/core/htrie.c index 7f9bef877..d7087059f 100644 --- a/db/core/htrie.c +++ b/db/core/htrie.c @@ -11,7 +11,7 @@ * and shutdown are performed in process context. * * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2021 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -918,7 +918,7 @@ tdb_htrie_init(void *p, size_t db_size, unsigned int rec_len) TDB_ERR("cannot allocate per-cpu data\n"); return NULL; } - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { TdbPerCpu *p = per_cpu_ptr(hdr->pcpu, cpu); p->i_wcl = tdb_alloc_blk(hdr); p->d_wcl = tdb_alloc_blk(hdr); diff --git a/fw/cache.c b/fw/cache.c index b6ea82441..f855dd059 100644 --- a/fw/cache.c +++ b/fw/cache.c @@ -26,6 +26,7 @@ #include #include #include +#include #undef DEBUG #if DBG_CACHE > 0 @@ -234,13 +235,13 @@ enum { }; typedef struct { - int cpu[NR_CPUS]; + int *cpu; atomic_t cpu_idx; unsigned int nr_cpus; TDB *db; } CaNode; -static CaNode c_nodes[MAX_NUMNODES]; +static CaNode *c_nodes; typedef int tfw_cache_write_actor_t(TDB *, TdbVRec **, TfwHttpResp *, char **, size_t, TfwDecodeCacheIter *); @@ -333,18 +334,59 @@ tfw_cache_key_node(unsigned long key) } /** - * Just choose any CPU for each node to use queue_work_on() for - * nodes scheduling. Reserve 0th CPU for other tasks. + * Release node-cpu map. */ static void +tfw_release_node_cpus(void) +{ + int node; + + if(!c_nodes) + return; + + for(node = 0; node < nr_online_nodes; node++) { + if(c_nodes[node].cpu) + kfree(c_nodes[node].cpu); + } + kfree(c_nodes); +} + +/** + * Create node-cpu map to use queue_work_on() for nodes scheduling. + * 0th CPU is reserved for other tasks. + * At the moment we doesn't support CPU hotplug, so enumerate only online CPUs. 
+ */ +static int tfw_init_node_cpus(void) { - int cpu, node; + int nr_cpus, cpu, node; + + T_DBG2("nr_online_nodes: %d", nr_online_nodes); + + c_nodes = kzalloc(nr_online_nodes * sizeof(CaNode), GFP_KERNEL); + if(!c_nodes) { + T_ERR("Failed to allocate nodes map for cache work scheduler"); + return -ENOMEM; + } + + for_each_node_with_cpus(node) { + nr_cpus = nr_cpus_node(node); + T_DBG2("node: %d nr_cpus: %d",node, nr_cpus); + c_nodes[node].cpu = kmalloc(nr_cpus * sizeof(int), GFP_KERNEL); + if(!c_nodes[node].cpu) { + T_ERR("Failed to allocate CPU array for node %d for cache work scheduler", + node); + return -ENOMEM; + } + } for_each_online_cpu(cpu) { node = cpu_to_node(cpu); + T_DBG2("node: %d cpu: %d",node, cpu); c_nodes[node].cpu[c_nodes[node].nr_cpus++] = cpu; } + + return 0; } static TDB * @@ -976,8 +1018,7 @@ tfw_cache_send_304(TfwHttpReq *req, TfwCacheEntry *ce) resp->mit.start_off = FRAME_HEADER_SIZE; - r = tfw_h2_resp_status_write(resp, 304, false, true, - stream_id); + r = tfw_h2_resp_status_write(resp, 304, false, true); if (unlikely(r)) goto err_setup; /* account for :status field itself */ @@ -1018,7 +1059,7 @@ tfw_cache_send_304(TfwHttpReq *req, TfwCacheEntry *ce) return; } - if (tfw_h2_frame_local_resp(resp, stream_id, h_len, NULL)) + if (tfw_h2_frame_local_resp(resp, h_len, NULL)) goto err_setup; tfw_h2_req_unlink_stream(req); @@ -2624,7 +2665,7 @@ tfw_cache_add_body_page(TfwMsgIter *it, char *p, int sz, bool h2, */ static int tfw_cache_build_resp_body(TDB *db, TdbVRec *trec, TfwMsgIter *it, char *p, - unsigned long body_sz, bool h2, unsigned int stream_id) + unsigned long body_sz, bool h2) { int r; bool sh_frag = h2 ? false : true; @@ -2660,10 +2701,6 @@ tfw_cache_build_resp_body(TDB *db, TdbVRec *trec, TfwMsgIter *it, char *p, !body_sz); if (r) return r; - if (stream_id) { - skb_set_tfw_flags(it->skb, SS_F_HTTT2_FRAME_DATA); - skb_set_tfw_cb(it->skb, stream_id); - } } if (!body_sz || !(trec = tdb_next_rec_chunk(db, trec))) break; @@ -2686,8 +2723,7 @@ tfw_cache_build_resp_body(TDB *db, TdbVRec *trec, TfwMsgIter *it, char *p, } static int -tfw_cache_set_hdr_age(TfwHttpResp *resp, TfwCacheEntry *ce, - unsigned int stream_id) +tfw_cache_set_hdr_age(TfwHttpResp *resp, TfwCacheEntry *ce) { int r; size_t digs; @@ -2718,8 +2754,7 @@ tfw_cache_set_hdr_age(TfwHttpResp *resp, TfwCacheEntry *ce, if (to_h2) { h_age.hpack_idx = 21; - if ((r = tfw_hpack_encode(resp, &h_age, false, false, - stream_id))) + if ((r = tfw_hpack_encode(resp, &h_age, false, false))) goto err; } else { if ((r = tfw_http_msg_expand_data(&mit->iter, skb_head, @@ -2761,8 +2796,7 @@ tfw_cache_set_hdr_age(TfwHttpResp *resp, TfwCacheEntry *ce, * TODO use iterator and passed skbs to be called from net_tx_action. */ static TfwHttpResp * -tfw_cache_build_resp(TfwHttpReq *req, TfwCacheEntry *ce, long lifetime, - unsigned int stream_id) +tfw_cache_build_resp(TfwHttpReq *req, TfwCacheEntry *ce, long lifetime) { int h; TfwStr dummy_body = { 0 }; @@ -2821,14 +2855,14 @@ tfw_cache_build_resp(TfwHttpReq *req, TfwCacheEntry *ce, long lifetime, * Set 'set-cookie' header if needed, for HTTP/2 or HTTP/1.1 * response. */ - if (tfw_http_sess_resp_process(resp, true, stream_id)) + if (tfw_http_sess_resp_process(resp, true)) goto free; /* * RFC 7234 p.4 Constructing Responses from Caches: * When a stored response is used to satisfy a request without * validation, a cache MUST generate an Age header field. 
*/ - if (tfw_cache_set_hdr_age(resp, ce, stream_id)) + if (tfw_cache_set_hdr_age(resp, ce)) goto free; if (!TFW_MSG_H2(req)) { @@ -2856,11 +2890,11 @@ tfw_cache_build_resp(TfwHttpReq *req, TfwCacheEntry *ce, long lifetime, } /* Set additional headers for HTTP/2 response. */ - if (tfw_h2_resp_add_loc_hdrs(resp, h_mods, true, stream_id) + if (tfw_h2_resp_add_loc_hdrs(resp, h_mods, true) || (lifetime > ce->lifetime - && tfw_h2_set_stale_warn(resp, stream_id)) + && tfw_h2_set_stale_warn(resp)) || (!test_bit(TFW_HTTP_B_HDR_DATE, resp->flags) - && tfw_h2_add_hdr_date(resp, true, stream_id))) + && tfw_h2_add_hdr_date(resp, true))) goto free; h_len += mit->acc_len; @@ -2881,7 +2915,7 @@ tfw_cache_build_resp(TfwHttpReq *req, TfwCacheEntry *ce, long lifetime, * send content in the response. */ dummy_body.len = req->method != TFW_HTTP_METH_HEAD ? ce->body_len : 0; - if (tfw_h2_frame_local_resp(resp, stream_id, h_len, &dummy_body)) + if (tfw_h2_frame_local_resp(resp, h_len, &dummy_body)) goto free; it->skb = ss_skb_peek_tail(&it->skb_head); it->frag = skb_shinfo(it->skb)->nr_frags - 1; @@ -2891,7 +2925,7 @@ tfw_cache_build_resp(TfwHttpReq *req, TfwCacheEntry *ce, long lifetime, BUG_ON(p != TDB_PTR(db->hdr, ce->body)); if (ce->body_len && req->method != TFW_HTTP_METH_HEAD) { if (tfw_cache_build_resp_body(db, trec, it, p, ce->body_len, - TFW_MSG_H2(req), stream_id)) + TFW_MSG_H2(req))) goto free; } resp->content_length = ce->body_len; @@ -2952,8 +2986,7 @@ cache_req_process_node(TfwHttpReq *req, tfw_http_cache_cb_t action) } } - resp = tfw_cache_build_resp(req, ce, lifetime, id); - + resp = tfw_cache_build_resp(req, ce, lifetime); /* * The stream of HTTP/2-request should be closed here since we have * successfully created the resulting response from cache and will @@ -3162,6 +3195,31 @@ tfw_cache_mgr(void *arg) } #endif +static inline int +tfw_cache_wq_init(int cpu) +{ + TfwWorkTasklet *ct = &per_cpu(cache_wq, cpu); + int r; + + r = tfw_wq_init(&ct->wq, TFW_DFLT_QSZ, cpu_to_node(cpu)); + if (unlikely(r)) + return r; + init_irq_work(&ct->ipi_work, tfw_cache_ipi); + tasklet_init(&ct->tasklet, tfw_wq_tasklet, (unsigned long)ct); + + return 0; +} + +static inline void +tfw_cache_wq_clear(int cpu) +{ + TfwWorkTasklet *ct = &per_cpu(cache_wq, cpu); + + tasklet_kill(&ct->tasklet); + irq_work_sync(&ct->ipi_work); + tfw_wq_destroy(&ct->wq); +} + static int tfw_cache_start(void) { @@ -3173,11 +3231,16 @@ tfw_cache_start(void) if (!(cache_cfg.cache || g_vhost->cache_purge)) return 0; - for_each_node_with_cpus(i) { + if ((r = tfw_init_node_cpus())) + goto node_cpus_alloc_err; + + for(i = 0; i < nr_online_nodes; i++) { c_nodes[i].db = tdb_open(cache_cfg.db_path, cache_cfg.db_size, 0, i); - if (!c_nodes[i].db) + if (!c_nodes[i].db) { + r = -ENOMEM; goto close_db; + } } #if 0 cache_mgr_thr = kthread_run(tfw_cache_mgr, NULL, "tfw_cache_mgr"); @@ -3187,19 +3250,14 @@ tfw_cache_start(void) goto close_db; } #endif - tfw_init_node_cpus(); TFW_WQ_CHECKSZ(TfwCWork); for_each_online_cpu(i) { - TfwWorkTasklet *ct = &per_cpu(cache_wq, i); - r = tfw_wq_init(&ct->wq, TFW_DFLT_QSZ, cpu_to_node(i)); - if (r) { - T_ERR_NL("%s: Can't initialize cache work queue for CPU #%d\n", - __func__, i); - goto close_db; + if (unlikely(r = tfw_cache_wq_init(i))) { + T_ERR_NL("%s: Can't initialize cache work" + " queue for CPU #%d\n", __func__, i); + goto free_tasklet; } - init_irq_work(&ct->ipi_work, tfw_cache_ipi); - tasklet_init(&ct->tasklet, tfw_wq_tasklet, (unsigned long)ct); } #if defined(DEBUG) @@ -3221,9 +3279,15 @@ tfw_cache_start(void) 
for_each_online_cpu(i) kfree(per_cpu(ce_dbg_buf, i)); #endif +free_tasklet: + for_each_online_cpu(i) + tfw_cache_wq_clear(i); close_db: for_each_node_with_cpus(i) tdb_close(c_nodes[i].db); + +node_cpus_alloc_err: + tfw_release_node_cpus(); return r; } @@ -3237,12 +3301,8 @@ tfw_cache_stop(void) if (!cache_cfg.cache) return; - for_each_online_cpu(i) { - TfwWorkTasklet *ct = &per_cpu(cache_wq, i); - tasklet_kill(&ct->tasklet); - irq_work_sync(&ct->ipi_work); - tfw_wq_destroy(&ct->wq); - } + for_each_online_cpu(i) + tfw_cache_wq_clear(i); #if 0 kthread_stop(cache_mgr_thr); #endif @@ -3254,6 +3314,8 @@ tfw_cache_stop(void) for_each_node_with_cpus(i) tdb_close(c_nodes[i].db); + + tfw_release_node_cpus(); } static const TfwCfgEnum cache_http_methods_enum[] = { diff --git a/fw/cfg.h b/fw/cfg.h index 149b4b43b..b8f672d16 100644 --- a/fw/cfg.h +++ b/fw/cfg.h @@ -183,13 +183,16 @@ typedef struct { */ #define TFW_CFG_ENTRY_FOR_EACH_ATTR(e, idx, k, v) \ for ((idx) = 0, (k) = (e)->attrs[0].key, (v) = (e)->attrs[0].val; \ - (idx++) < (e)->attr_n; \ - (k) = (e)->attrs[(idx)].key, (v) = (e)->attrs[(idx)].val) + (idx) < (e)->attr_n; \ + (idx)++, \ + (k) = (idx < (e)->attr_n ? (e)->attrs[(idx)].key : NULL), \ + (v) = (idx < (e)->attr_n ? (e)->attrs[(idx)].val : NULL)) #define TFW_CFG_ENTRY_FOR_EACH_VAL(e, idx, v) \ for ((idx) = 0, (v) = (e)->vals[0]; \ (idx) < (e)->val_n; \ - (v) = (e)->vals[++(idx)]) + (idx)++, \ + (v) = (idx < (e)->val_n ? (e)->vals[(idx)] : NULL)) #define TFW_CFG_CHECK_NO_ATTRS(spec, entry) \ if ((entry)->attr_n) { \ diff --git a/fw/connection.h b/fw/connection.h index 7a69955bb..bf1bc3422 100644 --- a/fw/connection.h +++ b/fw/connection.h @@ -28,7 +28,7 @@ #include "gfsm.h" #include "peer.h" #include "sync_socket.h" -#include "http_frame.h" +#include "http2.h" #include "tls.h" /* We account users with FRANG_FREQ frequency per second. */ @@ -292,7 +292,7 @@ typedef struct { */ #define tfw_h2_context_unsafe(conn) ((TfwH2Ctx *)(&((TfwH2Conn *)conn)->h2)) #define tfw_h2_context_safe(conn) \ - ttls_hs_done(tfw_tls_context(conn)) ? tfw_h2_context_unsafe(conn) : NULL; + ttls_hs_done(tfw_tls_context(conn)) ? tfw_h2_context_unsafe(conn) : NULL /* Callbacks used by l5-l7 protocols to operate on connection level. 
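The reworked TFW_CFG_ENTRY_FOR_EACH_ATTR and TFW_CFG_ENTRY_FOR_EACH_VAL iterators above guard the index so the final key/value loads never touch attrs[attr_n] or vals[val_n], one element past the populated range. A usage sketch under that assumption (the handler and its debug output are hypothetical, not taken from the patch):

static int
tfw_cfgop_example(TfwCfgSpec *cs, TfwCfgEntry *ce)
{
	const char *key, *val;
	size_t i;

	/* Inside the body i < attr_n always holds, so key/val are valid;
	 * once i reaches attr_n the macro yields NULL instead of reading
	 * one entry past the end of the array. */
	TFW_CFG_ENTRY_FOR_EACH_ATTR(ce, i, key, val) {
		T_DBG("attr %zu: %s=%s\n", i, key, val);
	}

	TFW_CFG_ENTRY_FOR_EACH_VAL(ce, i, val) {
		T_DBG("value %zu: %s\n", i, val);
	}

	return 0;
}
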
*/ @@ -548,8 +548,8 @@ tfw_connection_unlink_from_sk(struct sock *sk) sk->sk_data_ready = NULL; sk->sk_state_change = NULL; - sk->sk_prepare_xmit = NULL; sk->sk_write_xmit = NULL; + sk->sk_fill_write_queue = NULL; sk->sk_destroy_cb = NULL; sk->sk_user_data = NULL; diff --git a/fw/hpack.c b/fw/hpack.c index 115e069b7..c8b9a7b15 100644 --- a/fw/hpack.c +++ b/fw/hpack.c @@ -1116,7 +1116,6 @@ tfw_hpack_init(TfwHPack *__restrict hp, unsigned int htbl_sz) goto err_dt; et->window = htbl_sz; - spin_lock_init(&et->lock); et->rb_size = HPACK_ENC_TABLE_MAX_SIZE; if (!(et->pool = __tfw_pool_new(HPACK_ENC_TABLE_MAX_SIZE))) goto err_et; @@ -1387,7 +1386,50 @@ tfw_hpack_hdr_set(TfwHPack *__restrict hp, TfwHttpReq *__restrict req, return 0; } +static int +process_h2_trailer_hdr(TfwHttpMsg *hm, TfwStr *hdr, int tag) +{ + /* + * RFC 7230 4.1.2: + * + * A sender MUST NOT generate a trailer that contains a field necessary + * for message framing (e.g., Transfer-Encoding and Content-Length), + * routing (e.g., Host), request modifiers (e.g., controls and + * conditionals in Section 5 of [RFC7231]), authentication (e.g., see + * [RFC7235] and [RFC6265]), response control data (e.g., see Section + * 7.1 of [RFC7231]), or determining how to process the payload (e.g., + * Content-Encoding, Content-Type, Content-Range, and Trailer). + * + * RFC 9113 8.1: + * + * Trailers MUST NOT include pseudo-header fields. + */ + switch (tag) { + case TFW_TAG_HDR_H2_STATUS: + case TFW_TAG_HDR_H2_METHOD: + case TFW_TAG_HDR_H2_SCHEME: + case TFW_TAG_HDR_H2_AUTHORITY: + case TFW_TAG_HDR_H2_PATH: + case TFW_TAG_HDR_ACCEPT: + case TFW_TAG_HDR_AUTHORIZATION: + case TFW_TAG_HDR_CACHE_CONTROL: + case TFW_TAG_HDR_CONTENT_ENCODING: + case TFW_TAG_HDR_CONTENT_LENGTH: + case TFW_TAG_HDR_CONTENT_TYPE: + case TFW_TAG_HDR_COOKIE: + case TFW_TAG_HDR_IF_NONE_MATCH: + case TFW_TAG_HDR_HOST: + case TFW_TAG_HDR_IF_MODIFIED_SINCE: + case TFW_TAG_HDR_REFERER: + case TFW_TAG_HDR_USER_AGENT: + return T_BLOCK; + } + + hdr->flags |= TFW_STR_TRAILER; + __set_bit(TFW_HTTP_B_CHUNKED_TRAILER, hm->flags); + return T_OK; +} /* * HPACK decoder FSM for HTTP/2 message processing. @@ -1645,6 +1687,17 @@ tfw_hpack_decode(TfwHPack *__restrict hp, unsigned char *__restrict src, if ((r = frang_http_hdr_limit(req, parser->hdr.len))) goto out; + /* + * We check trailers here instead of __h2_msg_verify() + * because in case of indexed headers + * this function is not called. + */ + if (test_bit(TFW_HTTP_B_HEADERS_PARSED, req->flags)) + if ((r = process_h2_trailer_hdr((TfwHttpMsg*)req, + &(parser->hdr), + it->tag))) + goto out; + if (state & HPACK_FLAGS_ADD && (r = tfw_hpack_add_index(&hp->dec_tbl, it, &parser->cstate))) @@ -1685,6 +1738,17 @@ tfw_hpack_decode(TfwHPack *__restrict hp, unsigned char *__restrict src, if ((r = frang_http_hdr_limit(req, entry->hdr->len))) goto out; + /* + * We check trailers here instead of __h2_msg_verify() + * because in case of indexed headers + * this function is not called. 
+ */ + if (test_bit(TFW_HTTP_B_HEADERS_PARSED, req->flags)) + if ((r = process_h2_trailer_hdr((TfwHttpMsg*)req, + entry->hdr, + entry->tag))) + goto out; + if ((r = tfw_hpack_hdr_set(hp, req, entry))) goto out; @@ -3097,12 +3161,6 @@ tfw_hpack_encoder_index(TfwHPackETbl *__restrict tbl, if (WARN_ON_ONCE(!hdr)) return -EINVAL; - spin_lock(&tbl->lock); - - if (!test_bit(TFW_HTTP_B_H2_TRANS_ENTERED, flags) - && atomic64_read(&tbl->guard) < 0) - goto out; - tfw_http_hdr_split(hdr, &h_name, &h_val, spcolon); if (WARN_ON_ONCE(TFW_STR_EMPTY(&h_name))) return -EINVAL; @@ -3112,76 +3170,13 @@ tfw_hpack_encoder_index(TfwHPackETbl *__restrict tbl, *out_index = HPACK_NODE_GET_INDEX(tbl, node); - /* - * Encoder dynamic index can be in three states: initial state (@guard - * is zero), read state (@guard is 1 or greater), and write state - * (@guard is -1); in read state any thread can search in index, but - * nobody can add or evict entries in index; if index in the write state - * only one thread (current writer) can add/evict entries in index and - * nobody can search in index; index can be switched to write state - * only from initial state (in general case) or from read state (if - * current reader is the sole read owner of the index). - */ - if (!test_bit(TFW_HTTP_B_H2_TRANS_ENTERED, flags)) { - if(res != HPACK_IDX_ST_FOUND - && !atomic64_read(&tbl->guard) - && !tfw_hpack_add_node(tbl, hdr, &place, spcolon)) - { - res |= HPACK_IDX_FLAG_ADD; - atomic64_set(&tbl->guard, -1); - __set_bit(TFW_HTTP_B_H2_TRANS_ENTERED, flags); - } - else if (res != HPACK_IDX_ST_NOT_FOUND) - { - atomic64_inc(&tbl->guard); - __set_bit(TFW_HTTP_B_H2_TRANS_ENTERED, flags); - } - } - else { - /* - * If value of guard is 1, we are the sole owner of the encoder - * dynamic index with read rights, thus we can write to it. - * Note, that @guard cannot be zero here, since we are already - * owning encoder index with read or write rights (i.e. the flag - * @TFW_HTTP_B_H2_TRANS_ENTERED is set for the corrently - * processed message), thus we have already set the @guard - * equal to 1 (or greater) or to -1 before. 
- */ - WARN_ON_ONCE(!atomic64_read(&tbl->guard)); - if (res != HPACK_IDX_ST_FOUND - && atomic64_read(&tbl->guard) <= 1 - && !tfw_hpack_add_node(tbl, hdr, &place, spcolon)) - { - res |= HPACK_IDX_FLAG_ADD; - atomic64_set(&tbl->guard, -1); - } - } - -out: - spin_unlock(&tbl->lock); + if(res != HPACK_IDX_ST_FOUND + && !tfw_hpack_add_node(tbl, hdr, &place, spcolon)) + res |= HPACK_IDX_FLAG_ADD; return res; } -void -tfw_hpack_enc_release(TfwHPack *__restrict hp, unsigned long *flags) -{ - TfwHPackETbl *tbl = &hp->enc_tbl; - - if (!test_bit(TFW_HTTP_B_H2_TRANS_ENTERED, flags)) - return; - - if (atomic64_read(&tbl->guard) < 0) { - atomic64_set(&tbl->guard, 0); - } - else { - WARN_ON_ONCE(!atomic64_read(&tbl->guard)); - atomic64_dec(&tbl->guard); - } - - __clear_bit(TFW_HTTP_B_H2_TRANS_ENTERED, flags); -} - static unsigned long tfw_huffman_encode_string_len(TfwStr *str) { @@ -3552,17 +3547,16 @@ tfw_hpack_hdr_expand(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, */ static int __tfw_hpack_encode(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, - bool use_pool, bool dyn_indexing, bool trans, - unsigned int stream_id) + bool use_pool, bool dyn_indexing, bool trans) { TfwHPackInt idx; bool st_full_index; unsigned short st_index, index = 0; - TfwH2Ctx *ctx = tfw_h2_context_unsafe(resp->req->conn); + TfwConn *conn = resp->req->conn; + TfwH2Ctx *ctx = tfw_h2_context_unsafe(conn); TfwHPackETbl *tbl = &ctx->hpack.enc_tbl; int r = HPACK_IDX_ST_NOT_FOUND; bool name_indexed = true; - struct sk_buff *skb = resp->mit.iter.skb; if (WARN_ON_ONCE(!hdr || TFW_STR_EMPTY(hdr))) return -EINVAL; @@ -3575,6 +3569,7 @@ __tfw_hpack_encode(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, T_DBG_PRINT_HPACK_RBTREE(tbl); if (!st_full_index && dyn_indexing) { + assert_spin_locked(&conn->sk->sk_lock.slock); r = tfw_hpack_encoder_index(tbl, hdr, &index, resp->flags, trans); if (r < 0) @@ -3599,7 +3594,7 @@ __tfw_hpack_encode(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, return r; resp->mit.acc_len += idx.sz * !use_pool; - goto set_skb_priv; + return 0; } if (st_index || HPACK_IDX_RES(r) == HPACK_IDX_ST_NM_FOUND) { @@ -3632,30 +3627,14 @@ __tfw_hpack_encode(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, r = tfw_hpack_hdr_add(resp, hdr, &idx, name_indexed, trans); else r = tfw_hpack_hdr_expand(resp, hdr, &idx, name_indexed); -set_skb_priv: - if (likely(!r) && stream_id) { - /* - * Very long headers can be located in several skbs, - * mark them all. - */ - while(skb && unlikely(skb != resp->mit.iter.skb)) { - skb_set_tfw_flags(skb, SS_F_HTTT2_FRAME_HEADERS); - skb_set_tfw_cb(skb, stream_id); - skb = skb->next; - } - - skb_set_tfw_flags(resp->mit.iter.skb, SS_F_HTTT2_FRAME_HEADERS); - skb_set_tfw_cb(resp->mit.iter.skb, stream_id); - } return r; } int tfw_hpack_encode(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, - bool use_pool, bool dyn_indexing, unsigned int stream_id) + bool use_pool, bool dyn_indexing) { - return __tfw_hpack_encode(resp, hdr, use_pool, dyn_indexing, false, - stream_id); + return __tfw_hpack_encode(resp, hdr, use_pool, dyn_indexing, false); } /* @@ -3663,10 +3642,9 @@ tfw_hpack_encode(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, * into the HTTP/2 HPACK format. 
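The guard counter and table spinlock removed above are replaced by a simpler invariant: the encoder dynamic table is only touched while the connection's socket lock is held, which __tfw_hpack_encode() now asserts. A rough sketch of that assumption (the wrapper below is illustrative and is not a call path taken from the patch):

static int
encode_hdr_on_locked_sock(TfwHttpResp *resp, TfwStr *hdr)
{
	struct sock *sk = resp->req->conn->sk;
	int r;

	/* Every writer of this connection's encoder table runs under
	 * the socket lock, so concurrent add/evict races cannot happen
	 * and no table-local locking is needed. */
	bh_lock_sock(sk);
	r = tfw_hpack_encode(resp, hdr, true, true);
	bh_unlock_sock(sk);

	return r;
}
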
*/ int -tfw_hpack_transform(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, - unsigned int stream_id) +tfw_hpack_transform(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr) { - return __tfw_hpack_encode(resp, hdr, true, true, true, stream_id); + return __tfw_hpack_encode(resp, hdr, true, true, true); } void @@ -3678,8 +3656,6 @@ tfw_hpack_set_rbuf_size(TfwHPackETbl *__restrict tbl, unsigned short new_size) new_size = HPACK_ENC_TABLE_MAX_SIZE; } - spin_lock(&tbl->lock); - T_DBG3("%s: tbl->rb_len=%hu, tbl->size=%hu, tbl->window=%hu," " new_size=%hu\n", __func__, tbl->rb_len, tbl->size, tbl->window, new_size); @@ -3692,7 +3668,7 @@ tfw_hpack_set_rbuf_size(TfwHPackETbl *__restrict tbl, unsigned short new_size) * size that occurs in that interval MUST be signaled in a dynamic * table size update. */ - if (tbl->window != new_size && (likely(!atomic_read(&tbl->wnd_changed)) + if (tbl->window != new_size && (likely(!tbl->wnd_changed) || unlikely(!tbl->window) || new_size < tbl->window)) { if (tbl->size > new_size) @@ -3701,20 +3677,16 @@ tfw_hpack_set_rbuf_size(TfwHPackETbl *__restrict tbl, unsigned short new_size) WARN_ON_ONCE(tbl->rb_len > tbl->size); tbl->window = new_size; - atomic_set(&tbl->wnd_changed, 1); + tbl->wnd_changed = true; } - - spin_unlock(&tbl->lock); } int -tfw_hpack_enc_tbl_write_sz(TfwHPackETbl *__restrict tbl, struct sock *sk, - struct sk_buff *skb, TfwStream *stream, - unsigned int mss_now, unsigned int *t_tz) +tfw_hpack_enc_tbl_write_sz(TfwHPackETbl *__restrict tbl, TfwStream *stream) { TfwMsgIter it = { - .skb = skb, - .skb_head = ((struct sk_buff *)&sk->sk_write_queue), + .skb = stream->xmit.skb_head, + .skb_head = stream->xmit.skb_head, .frag = -1 }; TfwStr new_size = {}; @@ -3722,51 +3694,23 @@ tfw_hpack_enc_tbl_write_sz(TfwHPackETbl *__restrict tbl, struct sock *sk, char *data; int r = 0; - /* - * We should encode hpack dynamic table size, only in case when - * it was changed and only once. - */ - if (unlikely(atomic_cmpxchg(&tbl->wnd_changed, 1, -1) == 1)) { - write_int(tbl->window, 0x1F, 0x20, &tmp); - new_size.data = tmp.buf; - new_size.len = tmp.sz; - - data = tfw_http_iter_set_at_skb(&it, skb, FRAME_HEADER_SIZE); - if (!data) { - r = -E2BIG; - goto finish; - } + WARN_ON_ONCE(!tbl->wnd_changed); - r = tfw_h2_insert_frame_header(sk, skb, stream, mss_now, &it, - &data, &new_size, t_tz); - if (unlikely(r)) - goto finish; + write_int(tbl->window, 0x1F, 0x20, &tmp); + new_size.data = tmp.buf; + new_size.len = tmp.sz; - stream->xmit.h_len += tmp.sz; - } + data = ss_skb_data_ptr_by_offset(stream->xmit.skb_head, + FRAME_HEADER_SIZE); + BUG_ON(!data); -finish: + r = tfw_http_msg_insert(&it, &data, &new_size); if (unlikely(r)) - /* - * In case of error we should restore value of `wnd_changed` - * flag. - */ - atomic_set(&tbl->wnd_changed, 1); - return r; -} + return r; -void -tfw_hpack_enc_tbl_write_sz_release(TfwHPackETbl *__restrict tbl, int r) -{ - /* - * Before calling this function, we should check that we encode - * new dynamic table size into the frame, so `old` can have only - * two values (-1 in most of all cases, since we set it previosly - * or 1 if changing of dynamic table size was occured, before this - * function is called). - * We should change this flag only if it wasn't changed by - * `tfw_hpack_set_rbuf_size` function. - */ - int old = atomic_cmpxchg(&tbl->wnd_changed, -1, r == 0 ? 
0 : 1); - WARN_ON_ONCE(!old); + stream->xmit.h_len += tmp.sz; + tbl->wnd_changed = false; + + return 0; } + diff --git a/fw/hpack.h b/fw/hpack.h index 558810d57..010e2f9a1 100644 --- a/fw/hpack.h +++ b/fw/hpack.h @@ -90,32 +90,21 @@ typedef struct { * * @window - maximum pseudo-length of the dynamic table (in bytes); this * value used as threshold to flushing old entries; - * @wnd_changed - flag indicates, that window was changed by settings update, - * - can be in three states: - * - 0 in case when window size isn't changed. - * - 1 in case when window size is changed and it should be written - * into the first response, before the first header block. - * - -1 in case when window size is written into the first response, - * but this response was not sent to a client yet. + * @wnd_changed - flag indicates, that window was changed by settings update; * @rbuf - pointer to the ring buffer; * @root - pointer to the root node of binary tree; * @pool - memory pool for dynamic table; * @idx_acc - current accumulated index, intended for real indexes * calculation; - * @guard - atomic protection against races during entries - * addition/eviction in encoder dynamic index; - * @lock - spinlock to synchronize concurrent access to encoder index. */ typedef struct { TFW_HPACK_ETBL_COMMON; unsigned short window; - atomic_t wnd_changed; + bool wnd_changed; char *rbuf; TfwHPackNode *root; TfwPool *pool; unsigned long idx_acc; - atomic64_t guard; - spinlock_t lock; } TfwHPackETbl; /** @@ -314,10 +303,9 @@ void write_int(unsigned long index, unsigned short max, unsigned short mask, TfwHPackInt *__restrict res_idx); int tfw_hpack_init(TfwHPack *__restrict hp, unsigned int htbl_sz); void tfw_hpack_clean(TfwHPack *__restrict hp); -int tfw_hpack_transform(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, - unsigned int stream_id); +int tfw_hpack_transform(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr); int tfw_hpack_encode(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, - bool use_pool, bool dyn_indexing, unsigned int stream_id); + bool use_pool, bool dyn_indexing); void tfw_hpack_set_rbuf_size(TfwHPackETbl *__restrict tbl, unsigned short new_size); int tfw_hpack_decode(TfwHPack *__restrict hp, unsigned char *__restrict src, @@ -327,11 +315,7 @@ int tfw_hpack_cache_decode_expand(TfwHPack *__restrict hp, TfwHttpResp *__restrict resp, unsigned char *__restrict src, unsigned long n, TfwDecodeCacheIter *__restrict cd_iter); -void tfw_hpack_enc_release(TfwHPack *__restrict hp, unsigned long *flags); -int tfw_hpack_enc_tbl_write_sz(TfwHPackETbl *__restrict tbl, struct sock *sk, - struct sk_buff *skb, TfwStream *stream, - unsigned int mss_now, unsigned int *t_tz); -void tfw_hpack_enc_tbl_write_sz_release(TfwHPackETbl *__restrict tbl, int r); +int tfw_hpack_enc_tbl_write_sz(TfwHPackETbl *__restrict tbl, TfwStream *stream); static inline unsigned int tfw_hpack_int_size(unsigned long index, unsigned short max) diff --git a/fw/http.c b/fw/http.c index 8f0e138cb..2d711b969 100644 --- a/fw/http.c +++ b/fw/http.c @@ -586,10 +586,14 @@ tfw_http_resp_status_line(int status, size_t *len) /* * Preparing custom HTTP2 response to a client. + * We don't use hpack dynamic indexing in this function, because + * this function is used only for local responses and redirections + * which are used quite rarely. Also we don't use dynamic indexing + * for cache responses, which is much more significant (#1801). The + * behaviour may be changed during solving #1801. 
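For reference, the bytes tfw_hpack_enc_tbl_write_sz() above inserts right after the 9-byte frame header: write_int() with max 0x1F and mask 0x20 emits the RFC 7541 6.3 dynamic table size update, a '001' pattern followed by a 5-bit prefixed integer. A worked example, assuming an illustrative window of 4096:

static void
hpack_tbl_sz_update_example(void)
{
	TfwHPackInt tmp;

	/* Dynamic table size update: '001' pattern, 5-bit prefix int. */
	write_int(4096, 0x1F, 0x20, &tmp);
	/*
	 * 4096 >= 31, so the prefix saturates and the rest is a varint:
	 *   tmp.buf[0] = 0x20 | 0x1F                 = 0x3F
	 *   tmp.buf[1] = ((4096 - 31) % 128) | 0x80  = 0xE1
	 *   tmp.buf[2] = (4096 - 31) / 128           = 0x1F
	 * so tmp.sz == 3, and stream->xmit.h_len grows by those 3 bytes.
	 */
}
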
*/ static int -tfw_h2_prep_resp(TfwHttpResp *resp, unsigned short status, TfwStr *msg, - unsigned int stream_id) +tfw_h2_prep_resp(TfwHttpResp *resp, unsigned short status, TfwStr *msg) { int r, i; unsigned long hdrs_len = 0; @@ -601,16 +605,9 @@ tfw_h2_prep_resp(TfwHttpResp *resp, unsigned short status, TfwStr *msg, }; TfwStr *body = NULL; - BUG_ON(!resp->req); - if (!stream_id) { - stream_id = tfw_h2_req_stream_id(resp->req); - if (unlikely(!stream_id)) - return -EPIPE; - } - /* Set HTTP/2 ':status' pseudo-header. */ mit->start_off = FRAME_HEADER_SIZE; - r = tfw_h2_resp_status_write(resp, status, false, false, stream_id); + r = tfw_h2_resp_status_write(resp, status, false, true); if (unlikely(r)) goto out; @@ -636,7 +633,7 @@ tfw_h2_prep_resp(TfwHttpResp *resp, unsigned short status, TfwStr *msg, __TFW_STR_CH(&hdr, 0)->hpack_idx = name->hpack_idx; r = tfw_hpack_encode(resp, __TFW_STR_CH(&hdr, 0), - false, false, stream_id); + false, false); if (unlikely(r)) goto out; @@ -661,8 +658,7 @@ tfw_h2_prep_resp(TfwHttpResp *resp, unsigned short status, TfwStr *msg, __TFW_STR_CH(&hdr, 1)->len; hdr.hpack_idx = name->hpack_idx; - if ((r = tfw_hpack_encode(resp, &hdr, false, true, - stream_id))) + if ((r = tfw_hpack_encode(resp, &hdr, false, false))) goto out; } } @@ -677,7 +673,7 @@ tfw_h2_prep_resp(TfwHttpResp *resp, unsigned short status, TfwStr *msg, body = TFW_STR_BODY_CH(msg); - r = tfw_h2_frame_local_resp(resp, stream_id, hdrs_len, body); + r = tfw_h2_frame_local_resp(resp, hdrs_len, body); out: if (r) @@ -867,7 +863,7 @@ do { \ }; r = TFW_MSG_H2(req) - ? tfw_h2_prep_resp(resp, status, &msg, 0) + ? tfw_h2_prep_resp(resp, status, &msg) : tfw_h1_prep_resp(resp, status, &msg); return r; @@ -992,6 +988,17 @@ tfw_http_resp_pair_free(TfwHttpReq *req) tfw_http_conn_msg_free((TfwHttpMsg *)req); } +void +tfw_http_resp_pair_free_and_put_conn(void *opaque_data) +{ + TfwHttpResp *resp = (TfwHttpResp *)(opaque_data); + TfwHttpReq *req = resp->req; + + BUG_ON(!req || !req->conn); + tfw_connection_put(req->conn); + tfw_http_resp_pair_free(req); +} + /* * Close the client connection and free unpaired request. This function * is needed for cases when we cannot prepare response for this request. 
@@ -1052,7 +1059,7 @@ tfw_http_enum_resp_code(int status) */ int tfw_h2_resp_status_write(TfwHttpResp *resp, unsigned short status, - bool use_pool, bool cache, unsigned int stream_id) + bool use_pool, bool cache) { int ret; unsigned short index = tfw_h2_pseudo_index(status); @@ -1078,8 +1085,7 @@ tfw_h2_resp_status_write(TfwHttpResp *resp, unsigned short status, if (!tfw_ultoa(status, __TFW_STR_CH(&s_hdr, 1)->data, H2_STAT_VAL_LEN)) return -E2BIG; - if ((ret = tfw_hpack_encode(resp, &s_hdr, use_pool, !cache, - stream_id))) + if ((ret = tfw_hpack_encode(resp, &s_hdr, use_pool, !cache))) return ret; /* set status on response for access logging */ @@ -1091,27 +1097,27 @@ tfw_h2_resp_status_write(TfwHttpResp *resp, unsigned short status, void tfw_h2_resp_fwd(TfwHttpResp *resp) { + bool resp_in_xmit = + (TFW_SKB_CB(resp->msg.skb_head)->opaque_data == resp); TfwHttpReq *req = resp->req; - TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + TfwConn *conn = req->conn; - tfw_connection_get(req->conn); + tfw_connection_get(conn); do_access_log(resp); - if (tfw_cli_conn_send((TfwCliConn *)req->conn, (TfwMsg *)resp)) { + if (tfw_cli_conn_send((TfwCliConn *)conn, (TfwMsg *)resp)) { T_DBG("%s: cannot send data to client via HTTP/2\n", __func__); TFW_INC_STAT_BH(serv.msgs_otherr); - tfw_connection_close(req->conn, true); - } - else { + tfw_connection_close(conn, true); + /* We can't send response, so we should free it here. */ + resp_in_xmit = false; + } else { TFW_INC_STAT_BH(serv.msgs_forwarded); tfw_inc_global_hm_stats(resp->status); } - tfw_connection_put(req->conn); - - tfw_hpack_enc_release(&ctx->hpack, resp->flags); - - tfw_http_resp_pair_free(req); + if (!resp_in_xmit) + tfw_http_resp_pair_free_and_put_conn(resp); } /* @@ -1127,14 +1133,16 @@ tfw_h2_resp_fwd(TfwHttpResp *resp) */ static void tfw_h2_send_resp(TfwHttpReq *req, TfwStr *msg, int status, - unsigned int stream_id) + bool close_after_send) { - TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); TfwHttpResp *resp = tfw_http_msg_alloc_resp_light(req); if (unlikely(!resp)) goto err; - if (tfw_h2_prep_resp(resp, status, msg, stream_id)) + if (close_after_send) + set_bit(TFW_HTTP_B_CLOSE_ERROR_RESPONSE, resp->flags); + + if (tfw_h2_prep_resp(resp, status, msg)) goto err_setup; /* Send resulting HTTP/2 response and release HPACK encoder index. */ @@ -1146,8 +1154,6 @@ tfw_h2_send_resp(TfwHttpReq *req, TfwStr *msg, int status, T_DBG("%s: HTTP/2 response message transformation error: conn=[%p]\n", __func__, req->conn); - tfw_hpack_enc_release(&ctx->hpack, resp->flags); - tfw_http_msg_free((TfwHttpMsg *)resp); err: tfw_http_resp_build_error(req); @@ -1219,12 +1225,12 @@ tfw_http_prep_err_resp(TfwHttpReq *req, int status, TfwStr *msg) * pairing for pipelined requests is violated. 
*/ static void -tfw_h2_send_err_resp(TfwHttpReq *req, int status, unsigned int stream_id) +tfw_h2_send_err_resp(TfwHttpReq *req, int status, bool close_after_send) { TfwStr msg = MAX_PREDEF_RESP; tfw_http_prep_err_resp(req, status, &msg); - tfw_h2_send_resp(req, &msg, status, stream_id); + tfw_h2_send_resp(req, &msg, status, close_after_send); } /* @@ -1485,7 +1491,7 @@ tfw_http_send_err_resp(TfwHttpReq *req, int status, const char *reason) TFW_NO_PORT, status); if (TFW_MSG_H2(req)) - tfw_h2_send_err_resp(req, status, 0); + tfw_h2_send_err_resp(req, status, false); else tfw_h1_send_err_resp(req, status); } @@ -1494,7 +1500,7 @@ static void tfw_http_send_resp(TfwHttpReq *req, TfwStr *msg, int status) { if (TFW_MSG_H2(req)) { - tfw_h2_send_resp(req, msg, status, 0); + tfw_h2_send_resp(req, msg, status, false); } else { TfwCliConn *cli_conn = (TfwCliConn *)req->conn; @@ -4101,7 +4107,7 @@ tfw_http_adjust_resp(TfwHttpResp *resp) return r; } - r = tfw_http_sess_resp_process(resp, false, 0); + r = tfw_http_sess_resp_process(resp, false); if (r < 0) return r; @@ -4334,7 +4340,7 @@ tfw_h2_hdr_map(TfwHttpResp *resp, const TfwStr *hdr, unsigned int id) * transformation. */ static int -tfw_h2_add_hdr_via(TfwHttpResp *resp, unsigned int stream_id) +tfw_h2_add_hdr_via(TfwHttpResp *resp) { int r; TfwGlobal *g_vhost = tfw_vhost_get_global(); @@ -4354,7 +4360,7 @@ tfw_h2_add_hdr_via(TfwHttpResp *resp, unsigned int stream_id) via.hpack_idx = 60; - r = tfw_hpack_encode(resp, &via, true, true, stream_id); + r = tfw_hpack_encode(resp, &via, true, true); if (unlikely(r)) T_ERR("HTTP/2: unable to add 'via' header (resp=[%p])\n", resp); else @@ -4367,7 +4373,7 @@ tfw_h2_add_hdr_via(TfwHttpResp *resp, unsigned int stream_id) * transformation and for building response from cache. */ int -tfw_h2_add_hdr_date(TfwHttpResp *resp, bool cache, unsigned int stream_id) +tfw_h2_add_hdr_date(TfwHttpResp *resp, bool cache) { int r; char *s_date = *this_cpu_ptr(&g_buf); @@ -4384,7 +4390,7 @@ tfw_h2_add_hdr_date(TfwHttpResp *resp, bool cache, unsigned int stream_id) hdr.hpack_idx = 33; - r = tfw_hpack_encode(resp, &hdr, !cache, !cache, stream_id); + r = tfw_hpack_encode(resp, &hdr, !cache, !cache); if (unlikely(r)) T_ERR("HTTP/2: unable to add 'date' header to response" " [%p]\n", resp); @@ -4398,7 +4404,7 @@ tfw_h2_add_hdr_date(TfwHttpResp *resp, bool cache, unsigned int stream_id) * Add 'Content-Length:' header field to an HTTP message. */ static int -tfw_h2_add_hdr_clen(TfwHttpResp *resp, unsigned int stream_id) +tfw_h2_add_hdr_clen(TfwHttpResp *resp) { int r; char* buf = *this_cpu_ptr(&g_buf); @@ -4408,7 +4414,7 @@ tfw_h2_add_hdr_clen(TfwHttpResp *resp, unsigned int stream_id) r = tfw_h2_msg_hdr_add(resp, "content-length", SLEN("content-length"), buf, - cl_valsize, 28, stream_id); + cl_valsize, 28); if (unlikely(r)) T_ERR("%s: unable to add 'content-length' header (resp=[%p])\n", @@ -4426,7 +4432,7 @@ tfw_h2_add_hdr_clen(TfwHttpResp *resp, unsigned int stream_id) * from transfer encoding. 
*/ static int -tfw_h2_add_hdr_cenc(TfwHttpResp *resp, TfwStr *value, unsigned int stream_id) +tfw_h2_add_hdr_cenc(TfwHttpResp *resp, TfwStr *value) { int r; TfwStr name = { .data = "content-encoding", @@ -4441,7 +4447,7 @@ tfw_h2_add_hdr_cenc(TfwHttpResp *resp, TfwStr *value, unsigned int stream_id) .hpack_idx = 26 }; - r = tfw_hpack_encode(resp, &hdr, true, true, stream_id); + r = tfw_hpack_encode(resp, &hdr, true, true); if (unlikely(r)) goto err; @@ -4511,7 +4517,7 @@ tfw_http_resp_copy_encodings(TfwHttpResp *resp, TfwStr* dst, size_t max_len) * In case if response is stale, we should pass it with a warning. */ int -tfw_h2_set_stale_warn(TfwHttpResp *resp, unsigned int stream_id) +tfw_h2_set_stale_warn(TfwHttpResp *resp) { TfwStr wh = { .chunks = (TfwStr []){ @@ -4522,7 +4528,7 @@ tfw_h2_set_stale_warn(TfwHttpResp *resp, unsigned int stream_id) .nchunks = 2 }; - return tfw_hpack_encode(resp, &wh, false, false, stream_id); + return tfw_hpack_encode(resp, &wh, false, false); } /* @@ -4663,7 +4669,7 @@ tfw_h2_hdr_size(unsigned long n_len, unsigned long v_len, int tfw_h2_resp_add_loc_hdrs(TfwHttpResp *resp, const TfwHdrMods *h_mods, - bool cache, unsigned int stream_id) + bool cache) { unsigned int i; TfwHttpHdrTbl *ht = resp->h_tbl; @@ -4687,8 +4693,7 @@ tfw_h2_resp_add_loc_hdrs(TfwHttpResp *resp, const TfwHdrMods *h_mods, continue; } - r = tfw_hpack_encode(resp, desc->hdr, !cache, !cache, - stream_id); + r = tfw_hpack_encode(resp, desc->hdr, !cache, !cache); if (unlikely(r)) return r; } @@ -4730,8 +4735,7 @@ tfw_h2_hdr_sub(unsigned short hid, const TfwStr *hdr, const TfwHdrMods *h_mods) } static int -tfw_h2_hpack_encode_headers(TfwHttpResp *resp, const TfwHdrMods *h_mods, - unsigned int stream_id) +tfw_h2_hpack_encode_headers(TfwHttpResp *resp, const TfwHdrMods *h_mods) { int r; unsigned int i; @@ -4778,7 +4782,7 @@ tfw_h2_hpack_encode_headers(TfwHttpResp *resp, const TfwHdrMods *h_mods, if (hid == TFW_HTTP_HDR_SERVER) continue; - r = tfw_hpack_transform(resp, tgt, stream_id); + r = tfw_hpack_transform(resp, tgt); if (unlikely(r)) return r; } @@ -4796,8 +4800,7 @@ tfw_h2_hpack_encode_headers(TfwHttpResp *resp, const TfwHdrMods *h_mods, * processing thus no chunked body allowed, only plain TfwStr is accepted there. */ static int -tfw_h2_append_predefined_body(TfwHttpResp *resp, unsigned int stream_id, - const TfwStr *body) +tfw_h2_append_predefined_body(TfwHttpResp *resp, const TfwStr *body) { TfwHttpTransIter *mit = &resp->mit; TfwMsgIter *it = &mit->iter; @@ -4842,10 +4845,6 @@ tfw_h2_append_predefined_body(TfwHttpResp *resp, unsigned int stream_id, skb_fill_page_desc(it->skb, it->frag, page, 0, copy); ss_skb_adjust_data_len(it->skb, copy); - BUG_ON(!stream_id); - skb_set_tfw_flags(it->skb, SS_F_HTTT2_FRAME_DATA); - skb_set_tfw_cb(it->skb, stream_id); - if (it->frag + 1 == MAX_SKB_FRAGS && (r = tfw_msg_iter_append_skb(it))) { @@ -4856,54 +4855,51 @@ tfw_h2_append_predefined_body(TfwHttpResp *resp, unsigned int stream_id, return 0; } -/** - * Frame forwarded response. 
- */ -static int -tfw_h2_frame_fwd_resp(TfwHttpResp *resp, unsigned int stream_id, - unsigned long h_len) +int +tfw_http_on_send_resp(void *conn, struct sk_buff **skb_head) { - unsigned long b_len = TFW_HTTP_RESP_CUT_BODY_SZ(resp); - TfwMsgIter iter = {.frag = -1, .skb_head = resp->msg.skb_head}; - int r = 0; - - r = tfw_h2_stream_init_for_xmit(resp->req, h_len, b_len); - if (unlikely(r)) - return r; - - if (test_bit(TFW_HTTP_B_CHUNKED, resp->flags)) { - r = tfw_http_msg_cutoff_body_chunks(resp); - if (unlikely(r)) - return r; - } + TfwH2Ctx *ctx = tfw_h2_context_unsafe((TfwConn *)conn); + struct tfw_skb_cb *tfw_cb = TFW_SKB_CB(*skb_head); + TfwStream *stream; - if (b_len) { - if (test_bit(TFW_HTTP_B_CHUNKED, resp->flags)) - iter.skb = resp->body_start_skb; - else - iter.skb = resp->body.skb; - tfw_msg_iter_set_skb_priv(&iter, stream_id, - SS_F_HTTT2_FRAME_DATA); - } + stream = tfw_h2_find_not_closed_stream(ctx, tfw_cb->stream_id, false); + /* + * Very unlikely case. We check that stream is active, before + * calling ss_send, but there is a very small chance, that + * stream was canceled by RST STREAM from the client + * before ss_do_send was called. + */ + if (unlikely(!stream)) + return -EPIPE; + + BUG_ON(stream->xmit.skb_head); + stream->xmit.resp = (TfwHttpResp *)tfw_cb->opaque_data; + if (test_bit(TFW_HTTP_B_CLOSE_ERROR_RESPONSE, stream->xmit.resp->flags)) + ctx->error = stream; + swap(stream->xmit.skb_head, *skb_head); + sock_set_flag(((TfwConn *)conn)->sk, SOCK_TEMPESTA_HAS_DATA); + if (!stream->xmit.is_blocked) + tfw_h2_sched_activate_stream(&ctx->sched, stream); - return r; + return 0; } /** * Frame response generated locally. */ int -tfw_h2_frame_local_resp(TfwHttpResp *resp, unsigned int stream_id, - unsigned long h_len, const TfwStr *body) +tfw_h2_frame_local_resp(TfwHttpResp *resp, unsigned long h_len, + const TfwStr *body) { unsigned long b_len = body ? body->len : 0; int r; - r = tfw_h2_stream_init_for_xmit(resp->req, h_len, b_len); + r = tfw_h2_append_predefined_body(resp, body); if (unlikely(r)) return r; - return tfw_h2_append_predefined_body(resp, stream_id, body); + return tfw_h2_stream_init_for_xmit(resp, HTTP2_RELEASE_RESPONSE, + h_len, b_len); } static void @@ -4997,6 +4993,8 @@ tfw_h2_error_resp(TfwHttpReq *req, int status, bool reply, ErrorType type, { TfwStream *stream; TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + bool close_after_send = (type == TFW_ERROR_TYPE_ATTACK || + type == TFW_ERROR_TYPE_BAD); /* * block_action attack/error drop - Tempesta FW must block message @@ -5036,9 +5034,8 @@ tfw_h2_error_resp(TfwHttpReq *req, int status, bool reply, ErrorType type, * and GOAWAY frame should be sent (RFC 7540 section 6.8) after * error response. */ - tfw_h2_send_err_resp(req, status, stream->id); - if (type == TFW_ERROR_TYPE_ATTACK - || type == TFW_ERROR_TYPE_BAD) { + tfw_h2_send_err_resp(req, status, close_after_send); + if (close_after_send) { tfw_h2_conn_terminate_close(ctx, err_code, !on_req_recv_event, type == TFW_ERROR_TYPE_ATTACK); } else { @@ -5051,8 +5048,7 @@ tfw_h2_error_resp(TfwHttpReq *req, int status, bool reply, ErrorType type, goto out; skip_stream: - if (type == TFW_ERROR_TYPE_ATTACK - || type == TFW_ERROR_TYPE_BAD) { + if (close_after_send) { tfw_h2_conn_terminate_close(ctx, err_code, !on_req_recv_event, type == TFW_ERROR_TYPE_ATTACK); } @@ -5266,13 +5262,11 @@ __tfw_h2_resp_cleanup(TfwHttpRespCleanup *cleanup) * Major browsers and curl ignore that RFC requirement an work well. 
But * that is definitely an RFC violation and implementation specific behaviour. */ -static void -tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) +int +tfw_h2_resp_encode_headers(TfwHttpResp *resp) { int r; - unsigned int stream_id; TfwHttpReq *req = resp->req; - TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); TfwHttpTransIter *mit = &resp->mit; TfwHttpRespCleanup cleanup = {}; TfwStr codings = {.data = *this_cpu_ptr(&g_te_buf), .len = 0}; @@ -5280,10 +5274,6 @@ tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) req->vhost, TFW_VHOST_HDRMOD_RESP); - stream_id = tfw_h2_req_stream_id(req); - if (unlikely(!stream_id)) - goto out; - /* * Accordingly to RFC 9113 8.2.2 connection-specific headers can't * be used in HTTP/2. @@ -5315,7 +5305,8 @@ tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) WARN_ON_ONCE(mit->acc_len); tfw_h2_msg_transform_setup(mit, resp->msg.skb_head, true); - if (tfw_h2_msg_cutoff_headers(resp, &cleanup)) + r = tfw_h2_msg_cutoff_headers(resp, &cleanup); + if (unlikely(r)) goto clean; /* @@ -5326,12 +5317,11 @@ tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) if (unlikely(r)) goto clean; - r = tfw_h2_resp_status_write(resp, resp->status, true, false, - stream_id); + r = tfw_h2_resp_status_write(resp, resp->status, true, false); if (unlikely(r)) goto clean; - r = tfw_h2_hpack_encode_headers(resp, h_mods, stream_id); + r = tfw_h2_hpack_encode_headers(resp, h_mods); if (unlikely(r)) goto clean; @@ -5341,42 +5331,38 @@ tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) * processed above and which have non-empty value (i.e. configured * not for deletion). */ - r = tfw_http_sess_resp_process(resp, false, stream_id); + r = tfw_http_sess_resp_process(resp, false); if (unlikely(r)) goto clean; - r = tfw_h2_add_hdr_via(resp, stream_id); + r = tfw_h2_add_hdr_via(resp); if (unlikely(r)) goto clean; if (!test_bit(TFW_HTTP_B_HDR_DATE, resp->flags)) { - r = tfw_h2_add_hdr_date(resp, false, stream_id); + r = tfw_h2_add_hdr_date(resp, false); if (unlikely(r)) goto clean; } if (test_bit(TFW_HTTP_B_CHUNKED, resp->flags)) { - if (unlikely(tfw_h2_add_hdr_clen(resp, stream_id))) + if (unlikely(tfw_h2_add_hdr_clen(resp))) goto clean; } if (test_bit(TFW_HTTP_B_TE_EXTRA, resp->flags)) { - r = tfw_h2_add_hdr_cenc(resp, &codings, stream_id); + r = tfw_h2_add_hdr_cenc(resp, &codings); if (unlikely(r)) goto clean; TFW_STR_INIT(&codings); } - r = TFW_H2_MSG_HDR_ADD(resp, "server", TFW_SERVER, 54, stream_id); + r = TFW_H2_MSG_HDR_ADD(resp, "server", TFW_SERVER, 54); if (unlikely(r)) goto clean; - r = tfw_h2_resp_add_loc_hdrs(resp, h_mods, false, stream_id); - if (unlikely(r)) - goto clean; - - r = tfw_h2_frame_fwd_resp(resp, stream_id, mit->acc_len); + r = tfw_h2_resp_add_loc_hdrs(resp, h_mods, false); if (unlikely(r)) goto clean; @@ -5384,26 +5370,31 @@ tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) req, resp); SS_SKB_QUEUE_DUMP(&resp->msg.skb_head); - tfw_h2_req_unlink_stream(req); - tfw_h2_resp_fwd(resp); - __tfw_h2_resp_cleanup(&cleanup); + return 0; - return; clean: __tfw_h2_resp_cleanup(&cleanup); - tfw_http_conn_msg_free((TfwHttpMsg *)resp); - if (!(tfw_blk_flags & TFW_BLK_ERR_NOLOG)) - T_WARN_ADDR_STATUS("response dropped: processing error", - &req->conn->peer->addr, - TFW_NO_PORT, 500); - tfw_h2_send_err_resp(req, 500, stream_id); - tfw_hpack_enc_release(&ctx->hpack, resp->flags); - TFW_INC_STAT_BH(serv.msgs_otherr); + return r; +} - return; -out: - tfw_http_resp_pair_free(req); +static void +tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) +{ + TfwHttpReq *req = resp->req; + int r; + + /* + * This function can be failed only if stream is + * 
already closed and deleted. + */ + r = tfw_h2_stream_init_for_xmit(resp, HTTP2_ENCODE_HEADERS, 0, 0); + if (unlikely(r)) { + tfw_http_resp_pair_free(req); + } else { + tfw_h2_req_unlink_stream(req); + tfw_h2_resp_fwd(resp); + } } /** diff --git a/fw/http.h b/fw/http.h index de62d1999..4f69be64d 100644 --- a/fw/http.h +++ b/fw/http.h @@ -741,13 +741,13 @@ int tfw_http_expand_hbh(TfwHttpResp *resp, unsigned short status); int tfw_http_expand_hdr_via(TfwHttpResp *resp); void tfw_h2_resp_fwd(TfwHttpResp *resp); int tfw_h2_hdr_map(TfwHttpResp *resp, const TfwStr *hdr, unsigned int id); -int tfw_h2_add_hdr_date(TfwHttpResp *resp, bool cache, unsigned int stream_id); -int tfw_h2_set_stale_warn(TfwHttpResp *resp, unsigned int stream_id); +int tfw_h2_add_hdr_date(TfwHttpResp *resp, bool cache); +int tfw_h2_set_stale_warn(TfwHttpResp *resp); int tfw_h2_resp_add_loc_hdrs(TfwHttpResp *resp, const TfwHdrMods *h_mods, - bool cache, unsigned int stream_id); + bool cache); int tfw_h2_resp_status_write(TfwHttpResp *resp, unsigned short status, - bool use_pool, bool cache, - unsigned int stream_id); + bool use_pool, bool cache); +int tfw_h2_resp_encode_headers(TfwHttpResp *resp); /* * Functions to send an HTTP error response to a client. */ @@ -756,6 +756,7 @@ int tfw_http_prep_redir(TfwHttpResp *resp, unsigned short status, int tfw_http_prep_304(TfwHttpReq *req, struct sk_buff **skb_head, TfwMsgIter *it); void tfw_http_conn_msg_free(TfwHttpMsg *hm); +void tfw_http_resp_pair_free_and_put_conn(void *opaque_data); void tfw_http_send_err_resp(TfwHttpReq *req, int status, const char *reason); /* Helper functions */ @@ -764,12 +765,13 @@ unsigned long tfw_http_hdr_split(TfwStr *hdr, TfwStr *name_out, TfwStr *val_out, bool inplace); unsigned long tfw_h2_hdr_size(unsigned long n_len, unsigned long v_len, unsigned short st_index); -int tfw_h2_frame_local_resp(TfwHttpResp *resp, unsigned int stream_id, - unsigned long h_len, const TfwStr *body); +int tfw_h2_frame_local_resp(TfwHttpResp *resp, unsigned long h_len, + const TfwStr *body); int tfw_http_resp_copy_encodings(TfwHttpResp *resp, TfwStr* dst, size_t max_len); void tfw_http_extract_request_authority(TfwHttpReq *req); bool tfw_http_mark_is_in_whitlist(unsigned int mark); char *tfw_http_resp_status_line(int status, size_t *len); +int tfw_http_on_send_resp(void *conn, struct sk_buff **skb_head); #endif /* __TFW_HTTP_H__ */ diff --git a/fw/http2.c b/fw/http2.c new file mode 100644 index 000000000..8188b36b2 --- /dev/null +++ b/fw/http2.c @@ -0,0 +1,556 @@ +/** + * Tempesta FW + * + * Copyright (C) 2024 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +#undef DEBUG +#if DBG_HTTP2 > 0 +#define DEBUG DBG_HTTP2 +#endif + +#include "connection.h" +#include "http.h" +#include "http2.h" +#include "http_frame.h" +#include "http_msg.h" + +#define TFW_MAX_CLOSED_STREAMS 5 + +/** + * Usually client firstly send SETTINGS frame to a server, so: + * - we don't have many streams to iterate over in this function + * (usually we have no streams at all). + * - typically there is only one SETTINGS_INITIAL_WINDOW_SIZE + * frame is sent from a client side. + */ +static void +tfw_h2_apply_wnd_sz_change(TfwH2Ctx *ctx, long int delta) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + TfwStream *stream, *next; + + /* + * Order is no matter, use default funtion from the Linux kernel. + * According to RFC 9113 6.9.2 + * When the value of SETTINGS_INITIAL_WINDOW_SIZE changes, a receiver + * MUST adjust the size of all stream flow-control windows that it + * maintains by the difference between the new value and the old value. + * A change to SETTINGS_INITIAL_WINDOW_SIZE can cause the available + * space in a flow-control window to become negative. + */ + rbtree_postorder_for_each_entry_safe(stream, next, + &ctx->sched.streams, node) { + TfwStreamState state = tfw_h2_get_stream_state(stream); + if (state == HTTP2_STREAM_OPENED || + state == HTTP2_STREAM_REM_HALF_CLOSED) { + stream->rem_wnd += delta; + tfw_h2_stream_try_unblock(&ctx->sched, stream); + if (stream->rem_wnd > 0) { + sock_set_flag(((TfwConn *)conn)->sk, + SOCK_TEMPESTA_HAS_DATA); + } + } + } +} + +static void +tfw_h2_apply_settings_entry(TfwH2Ctx *ctx, unsigned short id, + unsigned int val) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + TfwSettings *dest = &ctx->rsettings; + long int delta; + + switch (id) { + case HTTP2_SETTINGS_TABLE_SIZE: + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + dest->hdr_tbl_sz = min_t(unsigned int, + val, HPACK_ENC_TABLE_MAX_SIZE); + tfw_hpack_set_rbuf_size(&ctx->hpack.enc_tbl, dest->hdr_tbl_sz); + break; + + case HTTP2_SETTINGS_ENABLE_PUSH: + BUG_ON(val > 1); + dest->push = val; + break; + + case HTTP2_SETTINGS_MAX_STREAMS: + dest->max_streams = val; + break; + + case HTTP2_SETTINGS_INIT_WND_SIZE: + BUG_ON(val > MAX_WND_SIZE); + delta = (long int)val - (long int)dest->wnd_sz; + tfw_h2_apply_wnd_sz_change(ctx, delta); + dest->wnd_sz = val; + break; + + case HTTP2_SETTINGS_MAX_FRAME_SIZE: + BUG_ON(val < FRAME_DEF_LENGTH || val > FRAME_MAX_LENGTH); + dest->max_frame_sz = val; + break; + + case HTTP2_SETTINGS_MAX_HDR_LIST_SIZE: + dest->max_lhdr_sz = val; + break; + + default: + /* + * We should silently ignore unknown identifiers (see + * RFC 9113 section 6.5.2) + */ + break; + } +} + +int +tfw_h2_check_settings_entry(TfwH2Ctx *ctx, unsigned short id, unsigned int val) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + + switch (id) { + case HTTP2_SETTINGS_TABLE_SIZE: + break; + + case HTTP2_SETTINGS_ENABLE_PUSH: + if (val > 1) + return -EINVAL; + break; + + case HTTP2_SETTINGS_MAX_STREAMS: + break; + + case HTTP2_SETTINGS_INIT_WND_SIZE: + if (val > MAX_WND_SIZE) + return -EINVAL; + break; + + case HTTP2_SETTINGS_MAX_FRAME_SIZE: + if (val < FRAME_DEF_LENGTH || val > FRAME_MAX_LENGTH) + return -EINVAL; + break; + + case HTTP2_SETTINGS_MAX_HDR_LIST_SIZE: + break; + + default: + /* + * We should silently ignore unknown identifiers (see + * RFC 9113 section 6.5.2) + */ + break; + } + + return 0; +} + +void +tfw_h2_save_settings_entry(TfwH2Ctx *ctx, unsigned short id, 
unsigned int val) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + + if (id > 0 && id < _HTTP2_SETTINGS_MAX) { + ctx->new_settings[id - 1] = val; + __set_bit(id, ctx->settings_to_apply); + __set_bit(HTTP2_SETTINGS_NEED_TO_APPLY, + ctx->settings_to_apply); + } +} + +void +tfw_h2_apply_new_settings(TfwH2Ctx *ctx) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + unsigned int id; + + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + + for (id = HTTP2_SETTINGS_TABLE_SIZE; id < _HTTP2_SETTINGS_MAX; id++) { + if (test_bit(id, ctx->settings_to_apply)) { + unsigned int val = ctx->new_settings[id - 1]; + tfw_h2_apply_settings_entry(ctx, id, val); + } + } + clear_bit(HTTP2_SETTINGS_NEED_TO_APPLY, ctx->settings_to_apply); +} + +int +tfw_h2_init(void) +{ + return tfw_h2_stream_cache_create(); +} + +void +tfw_h2_cleanup(void) +{ + tfw_h2_stream_cache_destroy(); +} + +int +tfw_h2_context_init(TfwH2Ctx *ctx) +{ + TfwStreamQueue *closed_streams = &ctx->closed_streams; + TfwStreamQueue *idle_streams = &ctx->idle_streams; + TfwSettings *lset = &ctx->lsettings; + TfwSettings *rset = &ctx->rsettings; + + bzero_fast(ctx, sizeof(*ctx)); + + ctx->state = HTTP2_RECV_CLI_START_SEQ; + ctx->loc_wnd = DEF_WND_SIZE; + ctx->rem_wnd = DEF_WND_SIZE; + + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&closed_streams->list); + INIT_LIST_HEAD(&idle_streams->list); + + tfw_h2_init_stream_sched(&ctx->sched); + + lset->hdr_tbl_sz = rset->hdr_tbl_sz = HPACK_TABLE_DEF_SIZE; + lset->push = rset->push = 1; + lset->max_streams = tfw_cli_max_concurrent_streams; + rset->max_streams = 0xffffffff; + lset->max_frame_sz = rset->max_frame_sz = FRAME_DEF_LENGTH; + lset->max_lhdr_sz = max_header_list_size ? + max_header_list_size : UINT_MAX; + rset->max_lhdr_sz = UINT_MAX; + + lset->wnd_sz = DEF_WND_SIZE; + rset->wnd_sz = DEF_WND_SIZE; + + return tfw_hpack_init(&ctx->hpack, HPACK_TABLE_DEF_SIZE); +} +ALLOW_ERROR_INJECTION(tfw_h2_context_init, ERRNO); + +void +tfw_h2_context_clear(TfwH2Ctx *ctx) +{ + WARN_ON_ONCE(ctx->streams_num); + /* + * Free POSTPONED SKBs. This is necessary when h2 context has + * postponed frames and connection closing initiated. + */ + ss_skb_queue_purge(&ctx->skb_head); + tfw_hpack_clean(&ctx->hpack); +} + +void +tfw_h2_conn_terminate_close(TfwH2Ctx *ctx, TfwH2Err err_code, bool close, + bool attack) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + + if (tfw_h2_send_goaway(ctx, err_code, attack) && close) { + if (attack) + tfw_connection_close((TfwConn *)conn, true); + else + tfw_connection_shutdown((TfwConn *)conn, true); + } +} + +/** + * According to RFC 9113 section 5.1.1: + * The first use of a new stream identifier implicitly closes all + * streams in the "idle" state that might have been initiated by that + * peer with a lower-valued stream identifier. + */ +void +tfw_h2_remove_idle_streams(TfwH2Ctx *ctx, unsigned int id) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + TfwStream *stream, *tmp; + + /* + * We add and remove streams from idle queue under + * socket lock. 
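Returning to tfw_h2_apply_wnd_sz_change() above, a worked example of the RFC 9113 6.9.2 adjustment it performs (the numbers below are illustrative only):

static void
wnd_sz_change_example(void)
{
	/* The peer lowers SETTINGS_INITIAL_WINDOW_SIZE from the 65535
	 * default to 16384 while an open stream still has 1000 bytes
	 * of send window left: */
	long int delta = 16384L - 65535L;	/* -49151 */
	long int rem_wnd = 1000 + delta;	/* -48151: now blocked */

	(void)rem_wnd;
	/* A negative rem_wnd keeps the stream off the transmit path
	 * until WINDOW_UPDATE frames raise it above zero; only streams
	 * whose window stays positive get SOCK_TEMPESTA_HAS_DATA set
	 * immediately in tfw_h2_apply_wnd_sz_change(). */
}
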
+ */ + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + + list_for_each_entry_safe_reverse(stream, tmp, &ctx->idle_streams.list, + hcl_node) + { + if (id <= stream->id) + break; + + tfw_h2_stream_del_from_queue_nolock(stream); + tfw_h2_set_stream_state(stream, HTTP2_STREAM_CLOSED); + tfw_h2_stream_add_closed(ctx, stream); + } +} + +void +tfw_h2_conn_streams_cleanup(TfwH2Ctx *ctx) +{ + TfwStream *cur, *next; + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + TfwStreamSched *sched = &ctx->sched; + + WARN_ON_ONCE(((TfwConn *)conn)->stream.msg); + + T_DBG3("%s: ctx [%p] conn %p sched %p\n", __func__, ctx, conn, sched); + + tfw_h2_remove_idle_streams(ctx, UINT_MAX); + + rbtree_postorder_for_each_entry_safe(cur, next, &sched->streams, node) { + tfw_h2_stream_purge_all_and_free_response(cur); + tfw_h2_stream_unlink_lock(ctx, cur); + + /* The streams tree is about to be destroyed and + * we don't want to trigger rebalancing. + * No further actions regarding streams dependencies/prio + * is required at this stage. + */ + tfw_h2_delete_stream(cur); + --ctx->streams_num; + } + sched->streams = RB_ROOT; +} + +void +tfw_h2_current_stream_remove(TfwH2Ctx *ctx) +{ + T_DBG3("%s: ctx [%p] ctx->cur_stream %p\n", __func__, + ctx, ctx->cur_stream); + tfw_h2_stream_unlink_lock(ctx, ctx->cur_stream); + tfw_h2_stream_clean(ctx, ctx->cur_stream); + ctx->cur_stream = NULL; +} + +/* + * Clean the queue of closed streams if its size has exceeded a certain + * value. + */ +void +tfw_h2_closed_streams_shrink(TfwH2Ctx *ctx) +{ + TfwStream *cur; + TfwStreamQueue *closed_streams = &ctx->closed_streams; + + T_DBG3("%s: ctx [%p] closed streams num %lu\n", __func__, ctx, + closed_streams->num); + + while (1) { + spin_lock(&ctx->lock); + + if (closed_streams->num <= TFW_MAX_CLOSED_STREAMS) { + spin_unlock(&ctx->lock); + break; + } + + BUG_ON(list_empty(&closed_streams->list)); + cur = list_first_entry(&closed_streams->list, TfwStream, + hcl_node); + tfw_h2_stream_unlink_nolock(ctx, cur); + + spin_unlock(&ctx->lock); + + T_DBG3("%s: ctx [%p] cur stream [%p]\n", __func__, ctx, cur); + + tfw_h2_stream_clean(ctx, cur); + } +} + +void +tfw_h2_check_current_stream_is_closed(TfwH2Ctx *ctx) +{ + BUG_ON(!ctx->cur_stream); + + T_DBG3("%s: strm [%p] id %u state %d(%s), streams_num %lu\n", + __func__, ctx->cur_stream, ctx->cur_stream->id, + tfw_h2_get_stream_state(ctx->cur_stream), + __h2_strm_st_n(ctx->cur_stream), ctx->streams_num); + + if (tfw_h2_stream_is_closed(ctx->cur_stream)) + tfw_h2_current_stream_remove(ctx); +} + +TfwStream * +tfw_h2_find_not_closed_stream(TfwH2Ctx *ctx, unsigned int id, bool recv) +{ + TfwStream *stream; + + stream = tfw_h2_find_stream(&ctx->sched, id); + return stream && !tfw_h2_stream_is_closed(stream) ? stream : NULL; +} + +/* + * Get stream ID for upper layer to create frames info. + */ +unsigned int +tfw_h2_req_stream_id(TfwHttpReq *req) +{ + unsigned int id = 0; + TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + + spin_lock(&ctx->lock); + + if (req->stream) + id = req->stream->id; + + spin_unlock(&ctx->lock); + + return id; +} + +/* + * Unlink request from corresponding stream (if linked). 
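tfw_h2_closed_streams_shrink() above keeps ctx->lock held only long enough to test the threshold and unlink a single stream, then runs the heavier cleanup with the lock released. A standalone illustration of the same shape, substituting a pthread mutex and a plain singly linked list for the real spinlock and stream queue (names are illustrative only):

#include <pthread.h>
#include <stdlib.h>

#define MAX_CLOSED	5

struct node {
	struct node *next;
};

struct closed_queue {
	pthread_mutex_t lock;
	struct node *head;	/* oldest entry first */
	unsigned long num;
};

static void shrink(struct closed_queue *q)
{
	struct node *cur;

	for (;;) {
		pthread_mutex_lock(&q->lock);
		if (q->num <= MAX_CLOSED) {
			pthread_mutex_unlock(&q->lock);
			break;
		}
		cur = q->head;
		q->head = cur->next;
		q->num--;
		pthread_mutex_unlock(&q->lock);

		free(cur);	/* heavy cleanup runs without the lock held */
	}
}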
+ */ +void +tfw_h2_req_unlink_stream(TfwHttpReq *req) +{ + TfwStream *stream; + TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + + spin_lock(&ctx->lock); + + stream = req->stream; + if (!stream) { + spin_unlock(&ctx->lock); + return; + } + + req->stream = NULL; + stream->msg = NULL; + + spin_unlock(&ctx->lock); +} + +/* + * Unlink request from corresponding stream (if linked), + * send RST STREAM and add stream to closed queue. + */ +void +tfw_h2_req_unlink_stream_with_rst(TfwHttpReq *req) +{ + TfwStreamFsmRes r; + TfwStream *stream; + TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + + spin_lock(&ctx->lock); + + stream = req->stream; + if (!stream) { + spin_unlock(&ctx->lock); + return; + } + + req->stream = NULL; + stream->msg = NULL; + + r = tfw_h2_stream_fsm_ignore_err(ctx, stream, HTTP2_RST_STREAM, 0); + WARN_ON_ONCE(r != STREAM_FSM_RES_OK && r != STREAM_FSM_RES_IGNORE); + + tfw_h2_stream_add_to_queue_nolock(&ctx->closed_streams, stream); + + spin_unlock(&ctx->lock); +} + +int +tfw_h2_stream_xmit_prepare_resp(TfwStream *stream) +{ + TfwHttpResp *resp = stream->xmit.resp; + unsigned char tls_type; + unsigned int mark; + int r = 0; + + BUG_ON(!resp || resp->msg.skb_head || !resp->req + || !resp->req->conn || !stream->xmit.skb_head); + + tls_type = skb_tfw_tls_type(stream->xmit.skb_head); + mark = stream->xmit.skb_head->mark; + swap(resp->msg.skb_head, stream->xmit.skb_head); + + r = tfw_h2_resp_encode_headers(resp); + if (unlikely(r)) { + T_WARN("Failed to encode headers"); + goto finish; + } + + stream->xmit.h_len = resp->mit.acc_len; + stream->xmit.b_len = TFW_HTTP_RESP_CUT_BODY_SZ(resp); + if (test_bit(TFW_HTTP_B_CHUNKED, resp->flags)) + r = tfw_http_msg_cutoff_body_chunks(resp); + +finish: + swap(stream->xmit.skb_head, resp->msg.skb_head); + ss_skb_setup_head_of_list(stream->xmit.skb_head, mark, tls_type); + + return r; +} + +int +tfw_h2_entail_stream_skb(struct sock *sk, TfwH2Ctx *ctx, TfwStream *stream, + unsigned int *len, bool should_split) +{ + unsigned char tls_type = skb_tfw_tls_type(stream->xmit.skb_head); + unsigned int mark = stream->xmit.skb_head->mark; + struct sk_buff *skb, *split; + int r = 0; + + BUG_ON(!TFW_SKB_CB(stream->xmit.skb_head)->is_head); + while (*len) { + skb = ss_skb_dequeue(&stream->xmit.skb_head); + BUG_ON(!skb); + + if (unlikely(!skb->len)) { + T_DBG3("[%d]: %s: drop skb=%px data_len=%u len=%u\n", + smp_processor_id(), __func__, + skb, skb->data_len, skb->len); + kfree_skb(skb); + continue; + } + + BUG_ON(!tls_type); + BUG_ON(!skb->len); + + if (skb->len > *len) { + if (should_split) { + split = ss_skb_split(skb, *len); + if (!split) { + ss_skb_queue_head(&stream->xmit.skb_head, + skb); + r = -ENOMEM; + break; + } + + ss_skb_queue_head(&stream->xmit.skb_head, split); + } else { + ss_skb_queue_head(&stream->xmit.skb_head, skb); + break; + } + } + *len -= skb->len; + ss_skb_tcp_entail(sk, skb, mark, tls_type); + } + + /* + * We use tls_type and mark from skb_head when we entail data in + * socket write queue. So we should set tls_type and mark for the + * new skb_head. + */ + if (stream->xmit.skb_head + && !TFW_SKB_CB(stream->xmit.skb_head)->is_head) { + ss_skb_setup_head_of_list(stream->xmit.skb_head, mark, + tls_type); + } + + return r; +} diff --git a/fw/http2.h b/fw/http2.h new file mode 100644 index 000000000..69e5a55dc --- /dev/null +++ b/fw/http2.h @@ -0,0 +1,167 @@ +/** + * Tempesta FW + * + * Copyright (C) 2024 Tempesta Technologies, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#ifndef __HTTP2__ +#define __HTTP2__ + +#include "http_frame.h" + +/** + * Representation of SETTINGS parameters for HTTP/2 connection (RFC 7540 + * section 6.5.2). + * + * @hdr_tbl_sz - maximum size of the endpoint's header compression + * table used to decode header blocks; + * @push - enable/disable indicator for server push; + * @max_streams - maximum number of streams that the endpoint will + * allow; + * @wnd_sz - endpoint's initial window size for stream-level + * flow control; + * @max_frame_sz - size of the largest frame payload the endpoint wish + * to receive; + * @max_lhdr_sz - maximum size of header list the endpoint prepared + * to accept; + */ +typedef struct { + unsigned int hdr_tbl_sz; + unsigned int push; + unsigned int max_streams; + unsigned int wnd_sz; + unsigned int max_frame_sz; + unsigned int max_lhdr_sz; +} TfwSettings; + +/** + * Context for HTTP/2 frames processing. + * + * @lock - spinlock to protect stream-request linkage; + * @lsettings - local settings for HTTP/2 connection; + * @rsettings - settings for HTTP/2 connection received from the + * remote endpoint; + * @lstream_id - ID of last stream initiated by client and processed + * on the server side; + * @streams_num - number of the streams initiated by client; + * @sched - streams' priority scheduler; + * @closed_streams - queue of closed streams (in HTTP2_STREAM_CLOSED or + * HTTP2_STREAM_REM_CLOSED state), which are waiting + * for removal; + * @idle_streams - queue of idle streams (in HTTP2_STREAM_IDLE) state; + * @loc_wnd - connection's current flow controlled window; + * @rem_wnd - remote peer current flow controlled window; + * @hpack - HPACK context, used in processing of + * HEADERS/CONTINUATION frames; + * @cur_send_headers - stream for which we have already started sending + * headers, but have not yet sent the END_HEADERS flag; + * @cur_recv_headers - stream for which we have already started receiving + * headers, but have not yet received the END_HEADERS + * flag; + * @error - the stream where the error occurred; + * @new_settings - new settings to apply when ack is pushed to socket + * write queue; + * @settings_to_apply - bitmap to save what settings we should apply. first + * bit is used to fast check that we should apply new + * settings. 1 - _HTTP2_SETTINGS_MAX - 1 bits are used + * to save what @new_settings should be applyed. 
bits + * from _HTTP2_SETTINGS_MAX are used to save what + * settings we sent to the client; + * @__off - offset to reinitialize processing context; + * @skb_head - collected list of processed skbs containing HTTP/2 + * frames; + * @cur_stream - found stream for the frame currently being processed; + * @priority - unpacked data from priority part of payload of + * processed HEADERS or PRIORITY frames; + * @hdr - unpacked data from header of currently processed + * frame; + * @plen - payload length of currently processed frame + * (HEADERS/CONTINUATION/DATA frames); + * @state - current FSM state of HTTP/2 processing context; + * @to_read - indicates how much data of HTTP/2 frame should + * be read on next FSM @state; + * @rlen - length of accumulated data in @rbuf + * or length of the payload read in current FSM state; + * @rbuf - buffer for data accumulation from frames headers and + * payloads (for service frames) during frames + * processing; + * @padlen - length of current frame's padding (if exists); + * @data_off - offset of app data in HEADERS, CONTINUATION and DATA + * frames (after all service payloads); + * + * NOTE: we can keep HPACK context in general connection-wide HTTP/2 context + * (instead of separate HPACK context for each stream), since frames from other + * streams cannot occur between the HEADERS/CONTINUATION frames of particular + * stream (RFC 7540, sections 6.2, 6.10, 8.1). + */ +typedef struct tfw_h2_ctx_t { + spinlock_t lock; + TfwSettings lsettings; + TfwSettings rsettings; + unsigned int lstream_id; + unsigned long streams_num; + TfwStreamSched sched; + TfwStreamQueue closed_streams; + TfwStreamQueue idle_streams; + long int loc_wnd; + long int rem_wnd; + TfwHPack hpack; + TfwStream *cur_send_headers; + TfwStream *cur_recv_headers; + TfwStream *error; + unsigned int new_settings[_HTTP2_SETTINGS_MAX - 1]; + DECLARE_BITMAP (settings_to_apply, 2 * _HTTP2_SETTINGS_MAX - 1); + char __off[0]; + struct sk_buff *skb_head; + TfwStream *cur_stream; + TfwFramePri priority; + TfwFrameHdr hdr; + unsigned int plen; + int state; + int to_read; + int rlen; + unsigned char rbuf[FRAME_HEADER_SIZE]; + unsigned char padlen; + unsigned char data_off; +} TfwH2Ctx; + +int tfw_h2_init(void); +void tfw_h2_cleanup(void); +int tfw_h2_context_init(TfwH2Ctx *ctx); +void tfw_h2_context_clear(TfwH2Ctx *ctx); +int tfw_h2_check_settings_entry(TfwH2Ctx *ctx, unsigned short id, + unsigned int val); +void tfw_h2_save_settings_entry(TfwH2Ctx *ctx, unsigned short id, + unsigned int val); +void tfw_h2_apply_new_settings(TfwH2Ctx *ctx); +void tfw_h2_conn_terminate_close(TfwH2Ctx *ctx, TfwH2Err err_code, bool close, + bool attack); +void tfw_h2_conn_streams_cleanup(TfwH2Ctx *ctx); +void tfw_h2_current_stream_remove(TfwH2Ctx *ctx); +void tfw_h2_remove_idle_streams(TfwH2Ctx *ctx, unsigned int id); +void tfw_h2_closed_streams_shrink(TfwH2Ctx *ctx); +void tfw_h2_check_current_stream_is_closed(TfwH2Ctx *ctx); +TfwStream *tfw_h2_find_not_closed_stream(TfwH2Ctx *ctx, unsigned int id, + bool recv); + +unsigned int tfw_h2_req_stream_id(TfwHttpReq *req); +void tfw_h2_req_unlink_stream(TfwHttpReq *req); +void tfw_h2_req_unlink_stream_with_rst(TfwHttpReq *req); +int tfw_h2_stream_xmit_prepare_resp(TfwStream *stream); +int tfw_h2_entail_stream_skb(struct sock *sk, TfwH2Ctx *ctx, TfwStream *stream, + unsigned int *len, bool should_split); + +#endif /* __HTTP2__ */ diff --git a/fw/http_frame.c b/fw/http_frame.c index 4ab3f6e1d..7c2465dba 100644 --- a/fw/http_frame.c +++ b/fw/http_frame.c @@ -26,6 +26,7 @@ #include 
"lib/str.h" #include "procfs.h" #include "http.h" +#include "http2.h" #include "http_frame.h" #include "http_msg.h" #include "tcp.h" @@ -45,34 +46,6 @@ #define STREAM_ID_SIZE 4 #define ERR_CODE_SIZE 4 -#define MAX_WND_SIZE ((1U << 31) - 1) -#define DEF_WND_SIZE ((1U << 16) - 1) - -#define TFW_MAX_CLOSED_STREAMS 5 - -/** - * FSM states for HTTP/2 frames processing. - */ -typedef enum { - HTTP2_RECV_FRAME_HEADER, - HTTP2_RECV_CLI_START_SEQ, - HTTP2_RECV_FIRST_SETTINGS, - HTTP2_RECV_FRAME_PRIORITY, - HTTP2_RECV_FRAME_WND_UPDATE, - HTTP2_RECV_FRAME_PING, - HTTP2_RECV_FRAME_RST_STREAM, - HTTP2_RECV_FRAME_SETTINGS, - HTTP2_RECV_FRAME_GOAWAY, - HTTP2_RECV_FRAME_PADDED, - HTTP2_RECV_HEADER_PRI, - HTTP2_IGNORE_FRAME_DATA, - __HTTP2_RECV_FRAME_APP, - HTTP2_RECV_HEADER = __HTTP2_RECV_FRAME_APP, - HTTP2_RECV_CONT, - HTTP2_RECV_DATA, - HTTP2_RECV_APP_DATA_POST -} TfwFrameState; - typedef enum { TFW_FRAME_DEFAULT, TFW_FRAME_SHUTDOWN, @@ -162,13 +135,9 @@ do { \ #define SET_TO_READ_VERIFY(ctx, next_state) \ do { \ - typeof(next_state) state = (!ctx->cur_stream || \ - tfw_h2_get_stream_state(ctx->cur_stream) < \ - HTTP2_STREAM_LOC_CLOSED) ? \ - next_state : HTTP2_IGNORE_FRAME_DATA; \ if ((ctx)->hdr.length) { \ SET_TO_READ(ctx); \ - (ctx)->state = state; \ + (ctx)->state = next_state; \ } else { \ (ctx)->state = HTTP2_IGNORE_FRAME_DATA; \ } \ @@ -194,76 +163,102 @@ do { \ tfw_h2_conn_terminate((ctx), err); \ return T_BAD; \ } else if (res == STREAM_FSM_RES_TERM_STREAM) { \ - return tfw_h2_stream_close((ctx), \ - (hdr)->stream_id, \ - &(ctx)->cur_stream, \ - err); \ + ctx->cur_stream = NULL; \ + return tfw_h2_send_rst_stream((ctx), \ + (hdr)->stream_id, \ + err); \ } \ return T_OK; \ } \ }) -int -tfw_h2_init(void) +static inline void +tfw_h2_unpack_priority(TfwFramePri *pri, const unsigned char *buf) { - return tfw_h2_stream_cache_create(); + pri->stream_id = ntohl(*(unsigned int *)buf) & FRAME_STREAM_ID_MASK; + pri->exclusive = (buf[0] & 0x80) > 0; + pri->weight = buf[4] + 1; } -void -tfw_h2_cleanup(void) -{ - tfw_h2_stream_cache_destroy(); -} +/** + * The flags indicate that an appropriate SETTINGS parameter is waited for an + * update. + */ +static const unsigned char +ctx_new_settings_flags[] = { + [HTTP2_SETTINGS_TABLE_SIZE] = 0x01, + [HTTP2_SETTINGS_ENABLE_PUSH] = 0x02, + [HTTP2_SETTINGS_MAX_STREAMS] = 0x04, + [HTTP2_SETTINGS_INIT_WND_SIZE] = 0x08, + [HTTP2_SETTINGS_MAX_FRAME_SIZE] = 0x10, + [HTTP2_SETTINGS_MAX_HDR_LIST_SIZE] = 0x20 +}; -int -tfw_h2_context_init(TfwH2Ctx *ctx) +static void +tfw_h2_on_tcp_entail_ack(void *conn, struct sk_buff *skb_head) { - TfwStreamQueue *closed_streams = &ctx->closed_streams; - TfwSettings *lset = &ctx->lsettings; - TfwSettings *rset = &ctx->rsettings; - - bzero_fast(ctx, sizeof(*ctx)); - - ctx->state = HTTP2_RECV_CLI_START_SEQ; - ctx->loc_wnd = DEF_WND_SIZE; - ctx->rem_wnd = DEF_WND_SIZE; + TfwH2Ctx *ctx = tfw_h2_context_unsafe((TfwConn *)conn); - spin_lock_init(&ctx->lock); - INIT_LIST_HEAD(&closed_streams->list); + if (test_bit(HTTP2_SETTINGS_NEED_TO_APPLY, ctx->settings_to_apply)) + tfw_h2_apply_new_settings(ctx); +} - lset->hdr_tbl_sz = rset->hdr_tbl_sz = HPACK_TABLE_DEF_SIZE; - lset->push = rset->push = 1; - lset->max_streams = tfw_cli_max_concurrent_streams; - rset->max_streams = 0xffffffff; - lset->max_frame_sz = rset->max_frame_sz = FRAME_DEF_LENGTH; - lset->max_lhdr_sz = max_header_list_size ? 
- max_header_list_size : UINT_MAX; - rset->max_lhdr_sz = UINT_MAX; +static int +tfw_h2_on_send_goaway(void *conn, struct sk_buff **skb_head) +{ + TfwH2Ctx *ctx = tfw_h2_context_unsafe((TfwConn *)conn); - lset->wnd_sz = DEF_WND_SIZE; - rset->wnd_sz = DEF_WND_SIZE; + if (ctx->error && ctx->error->xmit.skb_head) { + ss_skb_queue_splice(&ctx->error->xmit.skb_head, skb_head); + } else if (ctx->cur_send_headers) { + /* + * Other frames (from any stream) MUST NOT occur between + * the HEADERS frame and any CONTINUATION frames that might + * follow. Send goaway later. + */ + ctx->error = ctx->cur_send_headers; + ss_skb_queue_splice(&ctx->error->xmit.skb_head, skb_head); + } - return tfw_hpack_init(&ctx->hpack, HPACK_TABLE_DEF_SIZE); + return 0; } -void -tfw_h2_context_clear(TfwH2Ctx *ctx) +static int +tfw_h2_on_send_rst_stream(void *conn, struct sk_buff **skb_head) { - WARN_ON_ONCE(ctx->streams_num); + TfwH2Ctx *ctx = tfw_h2_context_unsafe((TfwConn *)conn); + unsigned int stream_id = TFW_SKB_CB(*skb_head)->stream_id; + TfwStream *stream; + + stream = tfw_h2_find_not_closed_stream(ctx, stream_id, false); + /* - * Free POSTPONED SKBs. This is necessary when h2 context has - * postponed frames and connection closing initiated. + * Send RST STREAM after all pending data otherwise directly push it + * to socket write queue. + * Stream can not exist in case when we send RST stream because a + * remote peer exceeded max_concurrent_streams limit. */ - ss_skb_queue_purge(&ctx->skb_head); - tfw_hpack_clean(&ctx->hpack); + if (stream && stream->xmit.skb_head) { + ss_skb_queue_splice(&stream->xmit.skb_head, skb_head); + } else if (ctx->cur_send_headers) { + ss_skb_queue_splice(&ctx->cur_send_headers->xmit.postponed, + skb_head); + } + + return 0; } -static inline void -tfw_h2_unpack_priority(TfwFramePri *pri, const unsigned char *buf) +static int +tfw_h2_on_send_dflt(void *conn, struct sk_buff **skb_head) { - pri->stream_id = ntohl(*(unsigned int *)buf) & FRAME_STREAM_ID_MASK; - pri->exclusive = (buf[0] & 0x80) > 0; - pri->weight = buf[4] + 1; + TfwH2Ctx *ctx = tfw_h2_context_unsafe((TfwConn *)conn); + + if (ctx->cur_send_headers) { + ss_skb_queue_splice(&ctx->cur_send_headers->xmit.postponed, + skb_head); + } + + return 0; } /** @@ -315,12 +310,18 @@ __tfw_h2_send_frame(TfwH2Ctx *ctx, TfwFrameHdr *hdr, TfwStr *data, break; } - if (hdr->flags == HTTP2_F_ACK && - (ctx->new_settings.flags & SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING)) - { - skb_set_tfw_flags(it.skb, SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING); - skb_set_tfw_cb(it.skb, ctx->new_settings.hdr_tbl_sz); - ctx->new_settings.flags &= ~SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING; + if (hdr->type == HTTP2_GOAWAY) { + TFW_SKB_CB(msg.skb_head)->on_send = tfw_h2_on_send_goaway; + } else if (hdr->type == HTTP2_RST_STREAM) { + TFW_SKB_CB(msg.skb_head)->on_send = tfw_h2_on_send_rst_stream; + TFW_SKB_CB(msg.skb_head)->stream_id = hdr->stream_id; + } else { + TFW_SKB_CB(msg.skb_head)->on_send = tfw_h2_on_send_dflt; + } + + if (hdr->type == HTTP2_SETTINGS && hdr->flags == HTTP2_F_ACK) { + TFW_SKB_CB(msg.skb_head)->on_tcp_entail = + tfw_h2_on_tcp_entail_ack; } if ((r = tfw_connection_send((TfwConn *)conn, &msg))) @@ -443,12 +444,14 @@ tfw_h2_send_settings_init(TfwH2Ctx *ctx) field[0].key = htons(HTTP2_SETTINGS_TABLE_SIZE); field[0].value = htonl(HPACK_ENC_TABLE_MAX_SIZE); - ctx->sent_settings[HTTP2_SETTINGS_TABLE_SIZE] = true; + __set_bit(_HTTP2_SETTINGS_MAX - 1 + HTTP2_SETTINGS_TABLE_SIZE, + ctx->settings_to_apply); BUILD_BUG_ON(SETTINGS_VAL_SIZE != 
sizeof(ctx->lsettings.wnd_sz)); field[1].key = htons(HTTP2_SETTINGS_INIT_WND_SIZE); field[1].value = htonl(ctx->lsettings.wnd_sz); - ctx->sent_settings[HTTP2_SETTINGS_INIT_WND_SIZE] = true; + __set_bit(_HTTP2_SETTINGS_MAX -1 + HTTP2_SETTINGS_INIT_WND_SIZE, + ctx->settings_to_apply); field[2].key = htons(HTTP2_SETTINGS_MAX_STREAMS); field[2].value = htonl(ctx->lsettings.max_streams); @@ -458,7 +461,9 @@ tfw_h2_send_settings_init(TfwH2Ctx *ctx) htons(HTTP2_SETTINGS_MAX_HDR_LIST_SIZE); field[required_fields].value = htonl(ctx->lsettings.max_lhdr_sz); - ctx->sent_settings[HTTP2_SETTINGS_MAX_HDR_LIST_SIZE] = true; + __set_bit(_HTTP2_SETTINGS_MAX - 1 + + HTTP2_SETTINGS_MAX_HDR_LIST_SIZE, + ctx->settings_to_apply); data.chunks[1].len += sizeof(field[0]); hdr.length += sizeof(field[0]); } @@ -480,7 +485,7 @@ tfw_h2_send_settings_ack(TfwH2Ctx *ctx) return tfw_h2_send_frame(ctx, &hdr, &data); } -static inline int +int tfw_h2_send_goaway(TfwH2Ctx *ctx, TfwH2Err err_code, bool attack) { unsigned char id_buf[STREAM_ID_SIZE]; @@ -538,20 +543,6 @@ tfw_h2_send_rst_stream(TfwH2Ctx *ctx, unsigned int id, TfwH2Err err_code) return tfw_h2_send_frame(ctx, &hdr, &data); } -void -tfw_h2_conn_terminate_close(TfwH2Ctx *ctx, TfwH2Err err_code, bool close, - bool attack) -{ - TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); - - if (tfw_h2_send_goaway(ctx, err_code, attack) && close) { - if (attack) - tfw_connection_close((TfwConn *)conn, true); - else - tfw_connection_shutdown((TfwConn *)conn, true); - } -} - static inline void tfw_h2_conn_terminate(TfwH2Ctx *ctx, TfwH2Err err_code) { @@ -608,160 +599,6 @@ tfw_h2_headers_pri_process(TfwH2Ctx *ctx) return T_OK; } -static inline void -tfw_h2_current_stream_remove(TfwH2Ctx *ctx) -{ - T_DBG3("%s: ctx [%p] ctx->cur_stream %p\n", __func__, ctx, ctx->cur_stream); - tfw_h2_stream_unlink_lock(ctx, ctx->cur_stream); - tfw_h2_stream_clean(ctx, ctx->cur_stream); - ctx->cur_stream = NULL; -} - -void -tfw_h2_conn_streams_cleanup(TfwH2Ctx *ctx) -{ - TfwStream *cur, *next; - TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); - TfwStreamSched *sched = &ctx->sched; - - WARN_ON_ONCE(((TfwConn *)conn)->stream.msg); - - T_DBG3("%s: ctx [%p] conn %p sched %p\n", __func__, ctx, conn, sched); - - rbtree_postorder_for_each_entry_safe(cur, next, &sched->streams, node) { - tfw_h2_stream_unlink_lock(ctx, cur); - - /* The streams tree is about to be destroyed and - * we don't want to trigger rebalancing. - * No further actions regarding streams dependencies/prio - * is required at this stage. - */ - tfw_h2_delete_stream(cur); - --ctx->streams_num; - } - sched->streams = RB_ROOT; -} - -/* - * Get stream ID for upper layer to create frames info. - */ -unsigned int -tfw_h2_req_stream_id(TfwHttpReq *req) -{ - unsigned int id = 0; - TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); - - spin_lock(&ctx->lock); - - if (req->stream) - id = req->stream->id; - - spin_unlock(&ctx->lock); - - return id; -} - -/* - * Unlink request from corresponding stream (if linked). - */ -void -tfw_h2_req_unlink_stream(TfwHttpReq *req) -{ - TfwStream *stream; - TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); - - spin_lock(&ctx->lock); - - stream = req->stream; - if (!stream) { - spin_unlock(&ctx->lock); - return; - } - - req->stream = NULL; - stream->msg = NULL; - - spin_unlock(&ctx->lock); -} - -/* - * Unlink request from corresponding stream (if linked), - * send RST STREAM and add stream to closed queue. 
- */ -void -tfw_h2_req_unlink_stream_with_rst(TfwHttpReq *req) -{ - TfwStreamFsmRes r; - TfwStream *stream; - TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); - - spin_lock(&ctx->lock); - - stream = req->stream; - if (!stream) { - spin_unlock(&ctx->lock); - return; - } - - req->stream = NULL; - stream->msg = NULL; - - r = tfw_h2_stream_fsm_ignore_err(ctx, stream, HTTP2_RST_STREAM, 0); - WARN_ON_ONCE(r != STREAM_FSM_RES_OK && r != STREAM_FSM_RES_IGNORE); - - tfw_h2_stream_add_to_queue_nolock(&ctx->closed_streams, stream); - - spin_unlock(&ctx->lock); -} - -/* - * Clean the queue of closed streams if its size has exceeded a certain - * value. - */ -static void -tfw_h2_closed_streams_shrink(TfwH2Ctx *ctx) -{ - TfwStream *cur; - TfwStreamQueue *closed_streams = &ctx->closed_streams; - - T_DBG3("%s: ctx [%p] closed streams num %lu\n", __func__, ctx, - closed_streams->num); - - while (1) { - spin_lock(&ctx->lock); - - if (closed_streams->num <= TFW_MAX_CLOSED_STREAMS) { - spin_unlock(&ctx->lock); - break; - } - - BUG_ON(list_empty(&closed_streams->list)); - cur = list_first_entry(&closed_streams->list, TfwStream, - hcl_node); - tfw_h2_stream_unlink_nolock(ctx, cur); - - spin_unlock(&ctx->lock); - - T_DBG3("%s: ctx [%p] cur stream [%p]\n", __func__, ctx, cur); - - tfw_h2_stream_clean(ctx, cur); - } -} - -static inline void -tfw_h2_check_closed_stream(TfwH2Ctx *ctx) -{ - BUG_ON(!ctx->cur_stream); - - T_DBG3("%s: strm [%p] id %u state %d(%s), streams_num %lu\n", - __func__, ctx->cur_stream, ctx->cur_stream->id, - tfw_h2_get_stream_state(ctx->cur_stream), - __h2_strm_st_n(ctx->cur_stream), ctx->streams_num); - - if (tfw_h2_stream_is_closed(ctx->cur_stream)) - tfw_h2_current_stream_remove(ctx); -} - static inline int tfw_h2_current_stream_state_process(TfwH2Ctx *ctx) { @@ -769,7 +606,7 @@ tfw_h2_current_stream_state_process(TfwH2Ctx *ctx) STREAM_RECV_PROCESS(ctx, hdr); - tfw_h2_check_closed_stream(ctx); + tfw_h2_check_current_stream_is_closed(ctx); return T_OK; } @@ -795,8 +632,9 @@ tfw_h2_headers_process(TfwH2Ctx *ctx) HTTP2_RST_STREAM, 0)) return -EPERM; - return tfw_h2_stream_close(ctx, hdr->stream_id, &ctx->cur_stream, - HTTP2_ECODE_PROTO); + ctx->cur_stream = NULL; + return tfw_h2_send_rst_stream(ctx, hdr->stream_id, + HTTP2_ECODE_PROTO); } if (!ctx->cur_stream) { @@ -804,6 +642,9 @@ tfw_h2_headers_process(TfwH2Ctx *ctx) if (!ctx->cur_stream) return -ENOMEM; ctx->lstream_id = hdr->stream_id; + } else if (ctx->cur_stream->state == HTTP2_STREAM_IDLE) { + tfw_h2_stream_remove_idle(ctx, ctx->cur_stream); + ctx->lstream_id = hdr->stream_id; } /* * Since the same received HEADERS frame can cause the stream to become @@ -847,19 +688,23 @@ tfw_h2_wnd_update_process(TfwH2Ctx *ctx) TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); long int *window = ctx->cur_stream ? 
&ctx->cur_stream->rem_wnd : &ctx->rem_wnd; - int size, mss; if (tfw_h2_increment_wnd_sz(window, wnd_incr)) { err_code = HTTP2_ECODE_FLOW; goto fail; } + if (ctx->cur_stream) + tfw_h2_stream_try_unblock(&ctx->sched, ctx->cur_stream); + if (*window > 0) { - mss = tcp_send_mss(((TfwConn *)conn)->sk, &size, - MSG_DONTWAIT); - tcp_push(((TfwConn *)conn)->sk, MSG_DONTWAIT, mss, - TCP_NAGLE_OFF|TCP_NAGLE_PUSH, size); + if (tfw_h2_stream_sched_is_active(&ctx->sched.root)) { + sock_set_flag(((TfwConn *)conn)->sk, + SOCK_TEMPESTA_HAS_DATA); + tcp_push_pending_frames(((TfwConn *)conn)->sk); + } } + return T_OK; } @@ -873,8 +718,9 @@ tfw_h2_wnd_update_process(TfwH2Ctx *ctx) HTTP2_RST_STREAM, 0)) return -EPERM; - return tfw_h2_stream_close(ctx, hdr->stream_id, &ctx->cur_stream, - err_code); + ctx->cur_stream = NULL; + return tfw_h2_send_rst_stream(ctx, hdr->stream_id, + err_code); } static inline int @@ -896,6 +742,18 @@ tfw_h2_priority_process(TfwH2Ctx *ctx) return T_OK; } + if (ctx->cur_stream->state == HTTP2_STREAM_IDLE) { + /* + * According to RFC 9113 we should response with stream + * error of type PROTOCOL ERROR here, but we can't send + * RST_STREAM for idle stream. + * RFC 9113 doesn't describe this case, so terminate + * connection. + */ + tfw_h2_conn_terminate(ctx, HTTP2_ECODE_PROTO); + return T_BAD; + } + /* * Stream cannot depend on itself (see RFC 7540 section 5.1.2 for * details). @@ -907,8 +765,9 @@ tfw_h2_priority_process(TfwH2Ctx *ctx) HTTP2_RST_STREAM, 0)) return -EPERM; - return tfw_h2_stream_close(ctx, hdr->stream_id, &ctx->cur_stream, - HTTP2_ECODE_PROTO); + ctx->cur_stream = NULL; + return tfw_h2_send_rst_stream(ctx, hdr->stream_id, + HTTP2_ECODE_PROTO); } static inline void @@ -922,93 +781,19 @@ tfw_h2_rst_stream_process(TfwH2Ctx *ctx) tfw_h2_current_stream_remove(ctx); } -static void -tfw_h2_apply_wnd_sz_change(TfwH2Ctx *ctx, long int delta) -{ - TfwStream *stream, *next; - - /* - * Order is no matter, use default funtion from the Linux kernel. - * According to RFC 9113 6.9.2 - * When the value of SETTINGS_INITIAL_WINDOW_SIZE changes, a receiver - * MUST adjust the size of all stream flow-control windows that it - * maintains by the difference between the new value and the old value. - * A change to SETTINGS_INITIAL_WINDOW_SIZE can cause the available - * space in a flow-control window to become negative. 
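The rule quoted above from RFC 9113 6.9.2 comes down to signed arithmetic: every open stream's remote window shifts by (new - old), and a negative result is legal, it only blocks the stream until enough WINDOW_UPDATE credit arrives. A minimal, self-contained check of that arithmetic:

#include <stdio.h>

/* Every open stream moves by (new - old); the result may go negative. */
static void apply_init_wnd_change(long *stream_wnd, int n, long old_sz, long new_sz)
{
	long delta = new_sz - old_sz;
	int i;

	for (i = 0; i < n; i++)
		stream_wnd[i] += delta;	/* < 0 blocks the stream until WINDOW_UPDATE credit */
}

int main(void)
{
	long wnd[2] = { 65535, 100 };

	apply_init_wnd_change(wnd, 2, 65535, 0);
	printf("%ld %ld\n", wnd[0], wnd[1]);	/* prints: 0 -65435 */
	return 0;
}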
- */ - rbtree_postorder_for_each_entry_safe(stream, next, - &ctx->sched.streams, node) { - TfwStreamState state = tfw_h2_get_stream_state(stream); - if (state == HTTP2_STREAM_OPENED || - state == HTTP2_STREAM_REM_HALF_CLOSED) - stream->rem_wnd += delta; - } -} - -static int -tfw_h2_apply_settings_entry(TfwH2Ctx *ctx, unsigned short id, - unsigned int val) -{ - TfwSettings *dest = &ctx->rsettings; - long int delta; - - switch (id) { - case HTTP2_SETTINGS_TABLE_SIZE: - ctx->new_settings.flags |= SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING; - ctx->new_settings.hdr_tbl_sz = min_t(unsigned int, - val, HPACK_ENC_TABLE_MAX_SIZE); - break; - - case HTTP2_SETTINGS_ENABLE_PUSH: - if (val > 1) - return -EINVAL; - dest->push = val; - break; - - case HTTP2_SETTINGS_MAX_STREAMS: - dest->max_streams = val; - break; - - case HTTP2_SETTINGS_INIT_WND_SIZE: - if (val > MAX_WND_SIZE) - return -EINVAL; - - delta = (long int)val - (long int)dest->wnd_sz; - tfw_h2_apply_wnd_sz_change(ctx, delta); - dest->wnd_sz = val; - break; - - case HTTP2_SETTINGS_MAX_FRAME_SIZE: - if (val < FRAME_DEF_LENGTH || val > FRAME_MAX_LENGTH) - return -EINVAL; - dest->max_frame_sz = val; - break; - - case HTTP2_SETTINGS_MAX_HDR_LIST_SIZE: - dest->max_lhdr_sz = val; - break; - - default: - /* - * We should silently ignore unknown identifiers (see - * RFC 7540 section 6.5.2) - */ - return 0; - } - - return 0; -} - static void tfw_h2_settings_ack_process(TfwH2Ctx *ctx) { T_DBG3("%s: parsed, stream_id=%u, flags=%hhu\n", __func__, ctx->hdr.stream_id, ctx->hdr.flags); - if (ctx->sent_settings[HTTP2_SETTINGS_TABLE_SIZE]) { + if (test_bit(_HTTP2_SETTINGS_MAX - 1 + HTTP2_SETTINGS_TABLE_SIZE, + ctx->settings_to_apply)) + { ctx->hpack.max_window = ctx->lsettings.hdr_tbl_sz; ctx->hpack.dec_tbl.wnd_update = true; - ctx->sent_settings[HTTP2_SETTINGS_TABLE_SIZE] = false; + clear_bit(_HTTP2_SETTINGS_MAX -1 + HTTP2_SETTINGS_TABLE_SIZE, + ctx->settings_to_apply); } } @@ -1022,9 +807,11 @@ tfw_h2_settings_process(TfwH2Ctx *ctx) T_DBG3("%s: entry parsed, id=%hu, val=%u\n", __func__, id, val); - if ((r = tfw_h2_apply_settings_entry(ctx, id, val))) + if ((r = tfw_h2_check_settings_entry(ctx, id, val))) return r; + tfw_h2_save_settings_entry(ctx, id, val); + ctx->to_read = hdr->length ? FRAME_SETTINGS_ENTRY_SIZE : 0; hdr->length -= ctx->to_read; @@ -1236,9 +1023,6 @@ tfw_h2_frame_type_process(TfwH2Ctx *ctx) (hdr->type <= _HTTP2_UNDEFINED ? hdr->type : _HTTP2_UNDEFINED); TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); -/* - * TODO: Use this macro for processing PRIORITY frame. - */ #define VERIFY_MAX_CONCURRENT_STREAMS(ctx, ACTION) \ do { \ unsigned int max_streams = ctx->lsettings.max_streams; \ @@ -1247,7 +1031,7 @@ do { \ \ if (max_streams == ctx->streams_num) { \ T_WARN("Max streams number exceeded: %lu\n", \ - ctx->streams_num); \ + ctx->streams_num); \ SET_TO_READ_VERIFY(ctx, HTTP2_IGNORE_FRAME_DATA); \ ACTION; \ } \ @@ -1313,13 +1097,7 @@ do { \ if (hdr->flags & HTTP2_F_PADDED) return tfw_h2_recv_padded(ctx); - /* TODO: #1196 Rework this part. 
*/ - if (tfw_h2_get_stream_state(ctx->cur_stream) >= - HTTP2_STREAM_LOC_CLOSED) - ctx->state = HTTP2_IGNORE_FRAME_DATA; - else - ctx->state = HTTP2_RECV_DATA; - + ctx->state = HTTP2_RECV_DATA; SET_TO_READ(ctx); return 0; @@ -1337,6 +1115,9 @@ do { \ err_code = HTTP2_ECODE_PROTO; goto conn_term; } + + tfw_h2_remove_idle_streams(ctx, hdr->stream_id); + /* * Endpoints must not exceed the limit set by their peer for * maximum number of concurrent streams (see RFC 7540 section @@ -1357,17 +1138,9 @@ do { \ if (hdr->flags & HTTP2_F_PRIORITY) return tfw_h2_recv_priority(ctx); - /* TODO: #1196 Rework this part. */ - if (ctx->cur_stream && - tfw_h2_get_stream_state(ctx->cur_stream) >= - HTTP2_STREAM_LOC_CLOSED) - { - ctx->state = HTTP2_IGNORE_FRAME_DATA; - } else { - ctx->state = HTTP2_RECV_HEADER; - } - + ctx->state = HTTP2_RECV_HEADER; SET_TO_READ(ctx); + return 0; case HTTP2_PRIORITY: @@ -1377,26 +1150,39 @@ do { \ } ctx->cur_stream = - tfw_h2_find_not_closed_stream(ctx, hdr->stream_id, - true); + tfw_h2_find_stream(&ctx->sched, hdr->stream_id); if (hdr->length != FRAME_PRIORITY_SIZE) goto conn_term; - /* TODO: #1196 Rework this part. */ - if (!ctx->cur_stream || - tfw_h2_get_stream_state(ctx->cur_stream) >= - HTTP2_STREAM_LOC_CLOSED) - { + if (ctx->cur_stream) { + STREAM_RECV_PROCESS(ctx, hdr); + ctx->state = HTTP2_RECV_FRAME_PRIORITY; + } else if (hdr->stream_id > ctx->lstream_id) { + VERIFY_MAX_CONCURRENT_STREAMS(ctx, { + err_code = HTTP2_ECODE_PROTO; + goto conn_term; + }); + /* + * According to RFC 9113 section 6.3: + * Priority frame can be sent in any stream state, + * including idle or closed streams. + */ + ctx->cur_stream = + tfw_h2_stream_create(ctx, hdr->stream_id); + if (!ctx->cur_stream) + return -ENOMEM; + + tfw_h2_stream_add_idle(ctx, ctx->cur_stream); + STREAM_RECV_PROCESS(ctx, hdr); + ctx->state = HTTP2_RECV_FRAME_PRIORITY; + } else { /* * According to RFC 9113 section 5.1: * PRIORITY frames are allowed in the `closed` state, - * but if the stream was moved to closed queue or was - * already removed from memory, just ignore this frame. + * but if the stream was already removed from memory, + * just ignore this frame. 
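The PRIORITY dispatch above lands in one of three branches: an existing stream is run through the stream FSM, an unseen higher-numbered stream is created in the idle state so its priority data is not lost, and a lower-numbered stream that was already freed is silently ignored (the real path also enforces the concurrent-streams limit before creating the idle stream). A compact sketch of that decision, using a hypothetical helper:

#include <stdbool.h>

enum prio_action {
	PRIO_PROCESS,		/* stream exists: run it through the stream FSM */
	PRIO_CREATE_IDLE,	/* unseen, higher id: create an idle stream to keep its priority */
	PRIO_IGNORE		/* lower id, already freed: drop the frame silently */
};

static enum prio_action
classify_priority(bool stream_exists, unsigned int id, unsigned int last_client_id)
{
	if (stream_exists)
		return PRIO_PROCESS;
	return id > last_client_id ? PRIO_CREATE_IDLE : PRIO_IGNORE;
}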
*/ ctx->state = HTTP2_IGNORE_FRAME_DATA; - } else { - STREAM_RECV_PROCESS(ctx, hdr); - ctx->state = HTTP2_RECV_FRAME_PRIORITY; } SET_TO_READ(ctx); @@ -1772,8 +1558,14 @@ tfw_h2_frame_recv(void *data, unsigned char *buf, unsigned int len, if ((ret = tfw_h2_current_stream_state_process(ctx))) FRAME_FSM_EXIT(ret); + if (unlikely(ctx->state == HTTP2_IGNORE_FRAME_DATA)) + __fsm_const_state = ctx->state; + if (unlikely(ctx->to_read)) { - FRAME_FSM_MOVE(HTTP2_RECV_APP_DATA_POST); + if (unlikely(ctx->state == HTTP2_IGNORE_FRAME_DATA)) + FRAME_FSM_MOVE(HTTP2_IGNORE_FRAME_DATA); + else + FRAME_FSM_MOVE(HTTP2_RECV_APP_DATA_POST); } FRAME_FSM_EXIT(T_OK); @@ -1785,8 +1577,14 @@ tfw_h2_frame_recv(void *data, unsigned char *buf, unsigned int len, if ((ret = tfw_h2_headers_process(ctx))) FRAME_FSM_EXIT(ret); + if (unlikely(ctx->state == HTTP2_IGNORE_FRAME_DATA)) + __fsm_const_state = ctx->state; + if (unlikely(ctx->to_read)) { - FRAME_FSM_MOVE(HTTP2_RECV_APP_DATA_POST); + if (unlikely(ctx->state == HTTP2_IGNORE_FRAME_DATA)) + FRAME_FSM_MOVE(HTTP2_IGNORE_FRAME_DATA); + else + FRAME_FSM_MOVE(HTTP2_RECV_APP_DATA_POST); } FRAME_FSM_EXIT(T_OK); @@ -1798,8 +1596,14 @@ tfw_h2_frame_recv(void *data, unsigned char *buf, unsigned int len, if ((ret = tfw_h2_current_stream_state_process(ctx))) FRAME_FSM_EXIT(ret); + if (unlikely(ctx->state == HTTP2_IGNORE_FRAME_DATA)) + __fsm_const_state = ctx->state; + if (unlikely(ctx->to_read)) { - FRAME_FSM_MOVE(HTTP2_RECV_APP_DATA_POST); + if (unlikely(ctx->state == HTTP2_IGNORE_FRAME_DATA)) + FRAME_FSM_MOVE(HTTP2_IGNORE_FRAME_DATA); + else + FRAME_FSM_MOVE(HTTP2_RECV_APP_DATA_POST); } FRAME_FSM_EXIT(T_OK); @@ -2072,240 +1876,348 @@ tfw_h2_frame_process(TfwConn *c, struct sk_buff *skb, struct sk_buff **next) return r; } -int -tfw_h2_insert_frame_header(struct sock *sk, struct sk_buff *skb, - TfwStream *stream, unsigned int mss_now, - TfwMsgIter *it, char **data, - const TfwStr *frame_hdr_str, - unsigned int *t_tz) +static inline unsigned int +tfw_h2_calc_frame_length(TfwH2Ctx *ctx, TfwStream *stream, TfwFrameType type, + unsigned int len, unsigned int max_len) { - struct sk_buff *next = NULL; - unsigned len = skb->len, next_len = 0; - unsigned int truesize = skb->truesize, next_truesize = 0; - unsigned long clear_mask; - long tmp_t_tz, delta; - int r; - -#define __ADJUST_SKB_LEN_CHANGE(sk, skb, olen) \ - delta = (long)skb->len - (long)olen; \ - TCP_SKB_CB(skb)->end_seq += delta; \ - tcp_sk(sk)->write_seq += delta; - - if (likely(!tcp_skb_is_last(sk, skb))) { - next = skb_queue_next(&sk->sk_write_queue, skb); - next_len = next->len; - next_truesize = next->truesize; - } - - r = tfw_http_msg_insert(it, data, frame_hdr_str); - if (unlikely(r)) - return r; + unsigned int length; - __ADJUST_SKB_LEN_CHANGE(sk, skb, len); - tmp_t_tz = (long)skb->truesize - (long)truesize; - /* - * If all HEADERS are located in current skb, we should clear - * an appropriate flag in the next skb. - */ - clear_mask = (skb->len - FRAME_HEADER_SIZE >= stream->xmit.h_len ? - SS_F_HTTT2_FRAME_HEADERS : 0); - - if (!tcp_skb_is_last(sk, skb) - && skb->next != next) { - /* New skb was allocated during data insertion. */ - next = skb->next; - /* Remove skb since it must be inserted into sk write queue. 
*/ - ss_skb_remove(next); - - tcp_sk(sk)->write_seq += next->len; - - tfw_tcp_setup_new_skb(sk, skb, next, mss_now); - skb_copy_tfw_cb(next, skb); - skb_clear_tfw_flag(next, clear_mask); - tmp_t_tz += next->truesize; - } else if (next && next->len != next_len) { - /* Some frags from current skb was moved to the next skb. */ - BUG_ON(next->len < next_len); - __ADJUST_SKB_LEN_CHANGE(sk, next, next_len); - - tfw_tcp_propagate_dseq(sk, skb); - skb_copy_tfw_cb(next, skb); - skb_clear_tfw_flag(next, clear_mask); - tmp_t_tz += (long)next->truesize - (long)next_truesize; - } else { - tfw_tcp_propagate_dseq(sk, skb); + length = min3(ctx->rsettings.max_frame_sz, len, max_len); + if (type == HTTP2_DATA) { + length = min3(length, (unsigned int)ctx->rem_wnd, + (unsigned int)stream->rem_wnd); } - BUG_ON(tmp_t_tz < 0); - *t_tz = tmp_t_tz; - - return r; - -#undef __ADJUST_SKB_LEN_CHANGE + return length; } -static unsigned char -tfw_h2_prepare_frame_flags(TfwStream *stream, TfwFrameType type, bool end) +static inline char +tf2_h2_calc_frame_flags(TfwStream *stream, TfwFrameType type) { - unsigned char flags; - switch (type) { case HTTP2_HEADERS: + return stream->xmit.h_len ? + (stream->xmit.b_len ? 0 : HTTP2_F_END_STREAM) : + (stream->xmit.b_len ? HTTP2_F_END_HEADERS : + HTTP2_F_END_HEADERS | HTTP2_F_END_STREAM); case HTTP2_CONTINUATION: - flags = stream->xmit.b_len ? 0 : HTTP2_F_END_STREAM; - flags |= end ? HTTP2_F_END_HEADERS : 0; - break; + return stream->xmit.h_len ? 0 : HTTP2_F_END_HEADERS; case HTTP2_DATA: - flags = end ? HTTP2_F_END_STREAM : 0; - break; + return stream->xmit.b_len ? 0 : HTTP2_F_END_STREAM; default: BUG(); }; - return flags; + return 0; } -static unsigned int -tfw_h2_calc_frame_length(struct sock *sk, struct sk_buff *skb, TfwH2Ctx *ctx, - TfwStream *stream, TfwFrameType type, - unsigned int limit, unsigned int len) +static inline int +tfw_h2_insert_frame_header(struct sock *sk, TfwH2Ctx *ctx, TfwStream *stream, + TfwFrameType type, unsigned long *snd_wnd, + unsigned long len) { - unsigned char tls_type = skb_tfw_tls_type(skb); - unsigned short clear_flag = (type == HTTP2_DATA ? - SS_F_HTTT2_FRAME_DATA : SS_F_HTTT2_FRAME_HEADERS); - unsigned short other_flag = (type == HTTP2_DATA ? - SS_F_HTTT2_FRAME_HEADERS : SS_F_HTTT2_FRAME_DATA); - unsigned int max_sz = min3(ctx->rsettings.max_frame_sz, limit, len); - unsigned int frame_sz = skb->len - FRAME_HEADER_SIZE - - stream->xmit.processed; - struct sk_buff *next = skb, *skb_tail = skb; + TfwMsgIter it = { + .skb_head = stream->xmit.skb_head, + .skb = stream->xmit.skb_head, + .frag = -1 + }; + unsigned char buf[FRAME_HEADER_SIZE]; + const TfwStr frame_hdr_str = { .data = buf, .len = sizeof(buf)}; + TfwFrameHdr frame_hdr = {}; + unsigned char tls_type = skb_tfw_tls_type(stream->xmit.skb_head); + unsigned int mark = stream->xmit.skb_head->mark; + unsigned int max_len = (*snd_wnd > TLS_MAX_PAYLOAD_SIZE + TLS_MAX_OVERHEAD) ? + TLS_MAX_PAYLOAD_SIZE : *snd_wnd - TLS_MAX_OVERHEAD; + unsigned int length; + char *data; + int r; + + + /* + * Very unlikely case, when skb_head and one or more next skbs + * are empty because of transformation during making HEADERS. 
+ */ + if (type == HTTP2_CONTINUATION || type == HTTP2_DATA) { + struct sk_buff *skb = stream->xmit.skb_head; + + while (skb && unlikely(!skb->len)) { + ss_skb_unlink(&stream->xmit.skb_head, skb); + kfree_skb(skb); + skb = stream->xmit.skb_head; + } + } + + data = ss_skb_data_ptr_by_offset(stream->xmit.skb_head, + stream->xmit.frame_length); + BUG_ON(!data); + + if (type == HTTP2_CONTINUATION || type == HTTP2_DATA) { + it.skb = it.skb_head = stream->xmit.skb_head; + if ((r = tfw_http_msg_insert(&it, &data, &frame_hdr_str))) + return r; + stream->xmit.skb_head = it.skb_head; + } + /* + * Set tls_type and mark, because skb_head could be changed + * during previous operations. + */ + ss_skb_setup_head_of_list(stream->xmit.skb_head, mark, tls_type); + + length = tfw_h2_calc_frame_length(ctx, stream, type, len, + max_len - FRAME_HEADER_SIZE); if (type == HTTP2_DATA) { - BUG_ON(ctx->rem_wnd <= 0 || stream->rem_wnd <= 0); - max_sz = min3(max_sz, (unsigned int)ctx->rem_wnd, - (unsigned int)stream->rem_wnd); + ctx->rem_wnd -= length; + stream->rem_wnd -= length; + stream->xmit.b_len -= length; + } else { + stream->xmit.h_len -= length; } - while (!tcp_skb_is_last(sk, skb_tail)) { - next = skb_queue_next(&sk->sk_write_queue, skb_tail); + *snd_wnd -= length; - if (frame_sz + next->len > max_sz) - break; - /* Don't put different message types into the same frame. */ - if (skb_tfw_tls_type(next) != tls_type) - break; - /* Don't agregate skbs with different frame types. */ - if (skb_tfw_flags(next) & other_flag) - break; - skb_clear_tfw_flag(next, clear_flag); - skb_set_tfw_flags(next, SS_F_HTTP2_FRAME_PREPARED); - stream->xmit.nskbs++; - frame_sz += next->len; - skb_tail = next; + frame_hdr.length = length; + frame_hdr.stream_id = stream->id; + frame_hdr.type = type; + frame_hdr.flags = tf2_h2_calc_frame_flags(stream, type); + tfw_h2_pack_frame_header(data, &frame_hdr); + + stream->xmit.frame_length += length + FRAME_HEADER_SIZE; + switch (tfw_h2_stream_send_process(ctx, stream, type)) { + case STREAM_FSM_RES_OK: + case STREAM_FSM_RES_IGNORE: + break; + case STREAM_FSM_RES_TERM_STREAM: + /* Send previosly successfully prepared frames if exist. */ + stream->xmit.frame_length -= length + FRAME_HEADER_SIZE; + if (stream->xmit.frame_length) { + r = tfw_h2_entail_stream_skb(sk, ctx, stream, + &stream->xmit.frame_length, + true); + } + stream->xmit.frame_length += length + FRAME_HEADER_SIZE; + /* + * Purge stream send queue, but leave postponed + * skbs and rst stream/goaway/tls alert if exist. + */ + tfw_h2_stream_purge_send_queue(stream); + return r; + case STREAM_FSM_RES_TERM_CONN: + return -EPIPE; } - return min(max_sz, frame_sz); + return 0; } static int -tfw_h2_make_frames(struct sock *sk, struct sk_buff *skb, TfwH2Ctx *ctx, - TfwStream *stream, TfwFrameType type, unsigned int mss_now, - unsigned int limit, unsigned int *t_tz) +tfw_h2_stream_xmit_process(struct sock *sk, TfwH2Ctx *ctx, TfwStream *stream, + int ss_action, unsigned long *snd_wnd) { int r = 0; - char *data; - unsigned char buf[FRAME_HEADER_SIZE]; - const TfwStr frame_hdr_str = { .data = buf, .len = sizeof(buf)}; - TfwMsgIter it = { - .skb = skb, - .skb_head = ((struct sk_buff *)&sk->sk_write_queue), - .frag = -1 - }; - TfwFrameHdr frame_hdr = {}; - unsigned long *len = (type == HTTP2_DATA ? 
- &stream->xmit.b_len : &stream->xmit.h_len); + TfwFrameType frame_type; + T_FSM_INIT(stream->xmit.state, "HTTP/2 make frames"); - if (WARN_ON_ONCE(limit <= FRAME_HEADER_SIZE)) - return -EINVAL; +#define CALC_SND_WND_AND_SET_FRAME_TYPE(type) \ +do { \ + if (*snd_wnd <= FRAME_HEADER_SIZE + TLS_MAX_OVERHEAD) \ + T_FSM_EXIT(); \ + frame_type = type; \ +} while(0) - data = tfw_http_iter_set_at_skb(&it, skb, stream->xmit.processed); - if (!data) - return -E2BIG; - if (type != HTTP2_HEADERS) { - /* - * Insert empty header first, because if some fragments will - * be moved from current skb to the next one, skb length will - * be changed. - */ - r = tfw_h2_insert_frame_header(sk, skb, stream, mss_now, &it, - &data, &frame_hdr_str, t_tz); + T_FSM_START(stream->xmit.state) { + + T_FSM_STATE(HTTP2_ENCODE_HEADERS) { + r = tfw_h2_stream_xmit_prepare_resp(stream); + fallthrough; + } + + T_FSM_STATE(HTTP2_RELEASE_RESPONSE) { + TfwHttpResp *resp = stream->xmit.resp; + + BUG_ON(!resp || !resp->req || !resp->req->conn); + tfw_http_resp_pair_free_and_put_conn(resp); + stream->xmit.resp = NULL; + /* Error during headers encoding. */ if (unlikely(r)) return r; + fallthrough; } - limit -= FRAME_HEADER_SIZE; + T_FSM_STATE(HTTP2_MAKE_HEADERS_FRAMES) { + CALC_SND_WND_AND_SET_FRAME_TYPE(HTTP2_HEADERS); + if (unlikely(ctx->hpack.enc_tbl.wnd_changed)) { + r = tfw_hpack_enc_tbl_write_sz(&ctx->hpack.enc_tbl, + stream); + if (unlikely(r < 0)) { + T_WARN("Failed to encode hpack dynamic" + "table size %d", r); + return r; + } + } - frame_hdr.stream_id = stream->id; - frame_hdr.type = type; - frame_hdr.length = tfw_h2_calc_frame_length(sk, skb, ctx, stream, - type, limit, *len); - frame_hdr.flags = tfw_h2_prepare_frame_flags(stream, type, - *len == frame_hdr.length); - tfw_h2_pack_frame_header(data, &frame_hdr); + r = tfw_h2_insert_frame_header(sk, ctx, stream, frame_type, + snd_wnd, stream->xmit.h_len); + if (unlikely(r)) { + T_WARN("Failed to make headers frame %d", r); + return r; + } - if (type == HTTP2_DATA) { - ctx->rem_wnd -= frame_hdr.length; - stream->rem_wnd -= frame_hdr.length; + T_FSM_JMP(HTTP2_SEND_FRAMES); } - stream->xmit.processed += frame_hdr.length + FRAME_HEADER_SIZE; - *len -= frame_hdr.length; - return 0; -} + T_FSM_STATE(HTTP2_MAKE_CONTINUATION_FRAMES) { + CALC_SND_WND_AND_SET_FRAME_TYPE(HTTP2_CONTINUATION); + r = tfw_h2_insert_frame_header(sk, ctx, stream, frame_type, + snd_wnd, stream->xmit.h_len); + if (unlikely(r)) { + T_WARN("Failed to make continuation frame %d", r); + return r; + } -int -tfw_h2_make_headers_frames(struct sock *sk, struct sk_buff *skb, - TfwH2Ctx *ctx, TfwStream *stream, - unsigned int mss_now, unsigned int limit, - unsigned int *t_tz) -{ - TfwFrameType type = skb_tfw_flags(skb) & SS_F_HTTP2_FRAME_START ? 
- HTTP2_HEADERS : HTTP2_CONTINUATION; + T_FSM_JMP(HTTP2_SEND_FRAMES); + } - return tfw_h2_make_frames(sk, skb, ctx, stream, type, - mss_now, limit, t_tz); -} + T_FSM_STATE(HTTP2_MAKE_DATA_FRAMES) { + if (ctx->rem_wnd <= 0 || stream->rem_wnd <= 0) { + ctx->sched.blocked_streams += + (stream->rem_wnd <= 0 + && !stream->xmit.is_blocked); + stream->xmit.is_blocked = stream->rem_wnd <= 0; + T_FSM_EXIT(); + } -int -tfw_h2_make_data_frames(struct sock *sk, struct sk_buff *skb, - TfwH2Ctx *ctx, TfwStream *stream, - unsigned int mss_now, unsigned int limit, - unsigned int *t_tz) -{ - return tfw_h2_make_frames(sk, skb, ctx, stream, HTTP2_DATA, - mss_now, limit, t_tz); + CALC_SND_WND_AND_SET_FRAME_TYPE(HTTP2_DATA); + r = tfw_h2_insert_frame_header(sk, ctx, stream, frame_type, + snd_wnd, stream->xmit.b_len); + if (unlikely (r)) { + T_WARN("Failed to make data frame %d", r); + return r; + } + + fallthrough; + } + + T_FSM_STATE(HTTP2_SEND_FRAMES) { + r = tfw_h2_entail_stream_skb(sk, ctx, stream, + &stream->xmit.frame_length, + false); + if (unlikely(r)) { + T_WARN("Failed to send frame %d", r); + return r; + } + + if (stream->xmit.h_len) { + T_FSM_JMP(HTTP2_MAKE_CONTINUATION_FRAMES); + } else { + if (unlikely(stream->xmit.postponed) && + !stream->xmit.frame_length) + ss_skb_tcp_entail_list(sk, + &stream->xmit.postponed); + if (stream->xmit.b_len) { + T_FSM_JMP(HTTP2_MAKE_DATA_FRAMES); + } else { + fallthrough; + } + } + } + + T_FSM_STATE(HTTP2_MAKE_FRAMES_FINISH) { + BUG_ON(stream->xmit.resp); + /* + * skb_head is not empty because RST stream or + * GOAWAY and TLS ALERT are pending until error + * response is sent. + */ + if (unlikely(stream->xmit.skb_head)) { + ss_skb_tcp_entail_list(sk, &stream->xmit.skb_head); + /* + * We set ctx->error only when we close connection + * after sending error response. If ss_action is + * SS_CLOSE we don't need to shutdown socket, because + * we will done it from `ss_do_close`. + */ + if (stream == ctx->error && ss_action != SS_CLOSE) + tcp_shutdown(sk, SEND_SHUTDOWN); + } + tfw_h2_stream_add_closed(ctx, stream); + if (stream == ctx->error) + ctx->error = NULL; + T_FSM_EXIT(); + } + + } + + T_FSM_FINISH(r, stream->xmit.state); + + if (stream->xmit.frame_length) { + r = tfw_h2_entail_stream_skb(sk, ctx, stream, + &stream->xmit.frame_length, + true); + } + + + return r; + +#undef CALC_SND_WND_AND_SET_FRAME_TYPE } -TfwStream * -tfw_h2_find_not_closed_stream(TfwH2Ctx *ctx, unsigned int id, - bool recv) +int +tfw_h2_make_frames(struct sock *sk, TfwH2Ctx *ctx, unsigned long snd_wnd, + int ss_action, bool *data_is_available) { + TfwStreamSched *sched = &ctx->sched; + TfwStreamSchedEntry *parent; TfwStream *stream; + u64 deficit; + bool error_was_sent = false; + int r = 0; + + while (tfw_h2_stream_sched_is_active(&sched->root) + && snd_wnd > FRAME_HEADER_SIZE + TLS_MAX_OVERHEAD + && ctx->rem_wnd > 0) + { + if (ctx->cur_send_headers) { + stream = ctx->cur_send_headers; + parent = stream->sched.parent; + tfw_h2_stream_sched_remove(sched, stream); + } else if (ctx->error) { + stream = ctx->error; + parent = stream->sched.parent; + tfw_h2_stream_sched_remove(sched, stream); + error_was_sent = true; + } else { + stream = tfw_h2_sched_stream_dequeue(sched, &parent); + } + + /* + * If root scheduler is active we always can find + * active stream. 
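The transmit loop here dequeues one schedulable stream, lets it emit as many frames as the send windows allow, and, as the lines just below show, re-enqueues it with a recalculated deficit so heavier streams get proportionally more service. The exact deficit formula lives in the stream scheduler and is not part of this hunk; the following is only a generic min-deficit sketch of the idea:

#include <stdio.h>

/*
 * Generic deficit-based weighted fairness: the stream with the smallest
 * deficit is served next, and each service charges it 65536/weight, so
 * heavier streams are charged less and therefore picked more often.
 */
struct wfq_stream {
	const char *name;
	unsigned int weight;		/* 1..256, as in HTTP/2 priorities */
	unsigned long long deficit;
};

static struct wfq_stream *pick_min(struct wfq_stream *s, int n)
{
	struct wfq_stream *best = &s[0];

	for (int i = 1; i < n; i++)
		if (s[i].deficit < best->deficit)
			best = &s[i];
	return best;
}

int main(void)
{
	struct wfq_stream s[2] = { { "a", 16, 0 }, { "b", 64, 0 } };

	for (int i = 0; i < 5; i++) {
		struct wfq_stream *cur = pick_min(s, 2);

		printf("%s ", cur->name);
		cur->deficit += 65536ULL / cur->weight;	/* re-enqueue with new deficit */
	}
	printf("\n");
	return 0;
}

With weights 16 and 64 this prints "a b b b b": the heavier stream is charged a quarter as much per service, so it is picked four times as often.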
+ */ + BUG_ON(!stream); + r = tfw_h2_stream_xmit_process(sk, ctx, stream, ss_action, + &snd_wnd); + deficit = tfw_h2_stream_recalc_deficit(stream); + tfw_h2_sched_stream_enqueue(sched, stream, parent, deficit); + + /* + * If we send error response we stop to send any data + * from other streams, so we either sent all error response + * or blocked by window size. + */ + if (error_was_sent || r) + break; + } + + *data_is_available = + tfw_h2_stream_sched_is_active(&sched->root) && ctx->rem_wnd; - stream = tfw_h2_find_stream(&ctx->sched, id); /* - * RFC 9113 section 5.1: - * An endpoint that sends a RST_STREAM frame on a stream that is in - * the "open" or "half-closed (local)" state could receive any type - * of frame. The peer might have sent or enqueued for sending these - * frames before processing the RST_STREAM frame. - * It is HTTP2_STREAM_LOC_CLOSED state in our implementation. + * Send shutdown if there is no pending error response in our scheduler + * and this function is called from `ss_do_shutdown`. */ - if (!stream || (stream->queue == &ctx->closed_streams - && (!recv || tfw_h2_get_stream_state(stream) > - HTTP2_STREAM_LOC_CLOSED))) - return NULL; + if ((!ctx->error || r) && ss_action == SS_SHUTDOWN) + tcp_shutdown(sk, SEND_SHUTDOWN); - return stream; + return r; } diff --git a/fw/http_frame.h b/fw/http_frame.h index 57378ed03..bca9e1f1b 100644 --- a/fw/http_frame.h +++ b/fw/http_frame.h @@ -53,6 +53,7 @@ typedef enum { * section 6.5.2). */ typedef enum { + HTTP2_SETTINGS_NEED_TO_APPLY = 0x00, HTTP2_SETTINGS_TABLE_SIZE = 0x01, HTTP2_SETTINGS_ENABLE_PUSH, HTTP2_SETTINGS_MAX_STREAMS, @@ -127,149 +128,39 @@ typedef struct { } TfwFramePri; /** - * Representation of SETTINGS parameters for HTTP/2 connection (RFC 7540 - * section 6.5.2). - * - * @hdr_tbl_sz - maximum size of the endpoint's header compression - * table used to decode header blocks; - * @push - enable/disable indicator for server push; - * @max_streams - maximum number of streams that the endpoint will - * allow; - * @wnd_sz - endpoint's initial window size for stream-level - * flow control; - * @max_frame_sz - size of the largest frame payload the endpoint wish - * to receive; - * @max_lhdr_sz - maximum size of header list the endpoint prepared - * to accept; - */ -typedef struct { - unsigned int hdr_tbl_sz; - unsigned int push; - unsigned int max_streams; - unsigned int wnd_sz; - unsigned int max_frame_sz; - unsigned int max_lhdr_sz; -} TfwSettings; - -/** - * Context for HTTP/2 frames processing. 
- * - * @lock - spinlock to protect stream-request linkage; - * @lsettings - local settings for HTTP/2 connection; - * @rsettings - settings for HTTP/2 connection received from the - * remote endpoint; - * @streams_num - number of the streams initiated by client; - * @sched - streams' priority scheduler; - * @closed_streams - queue of closed streams (in HTTP2_STREAM_CLOSED or - * HTTP2_STREAM_REM_CLOSED state), which are waiting - * for removal; - * @lstream_id - ID of last stream initiated by client and processed on - * the server side; - * @loc_wnd - connection's current flow controlled window; - * @rem_wnd - remote peer current flow controlled window; - * @hpack - HPACK context, used in processing of - * HEADERS/CONTINUATION frames; - * @cur_send_headers - stream for which we have already started sending - * headers, but have not yet sent the END_HEADERS flag; - * @cur_recv_headers - stream for which we have already started receiving - * headers, but have not yet received the END_HEADERS - * flag; - * @sent_settings - the settings were sent, when ack will be received - * we should apply these local settings. - * @__off - offset to reinitialize processing context; - * @skb_head - collected list of processed skbs containing HTTP/2 - * frames; - * @cur_stream - found stream for the frame currently being processed; - * @priority - unpacked data from priority part of payload of - * processed HEADERS or PRIORITY frames; - * @hdr - unpacked data from header of currently processed - * frame; - * @plen - payload length of currently processed frame - * (HEADERS/CONTINUATION/DATA frames); - * @state - current FSM state of HTTP/2 processing context; - * @to_read - indicates how much data of HTTP/2 frame should - * be read on next FSM @state; - * @rlen - length of accumulated data in @rbuf - * or length of the payload read in current FSM state; - * @rbuf - buffer for data accumulation from frames headers and - * payloads (for service frames) during frames - * processing; - * @padlen - length of current frame's padding (if exists); - * @data_off - offset of app data in HEADERS, CONTINUATION and DATA - * frames (after all service payloads); - * @new_settings - struct which contains flags and new settings, which - * should be applyed in `xmit` callback. Currently it - * is used only for new hpack dynamic table size, but - * can be wide later. - * - * NOTE: we can keep HPACK context in general connection-wide HTTP/2 context - * (instead of separate HPACK context for each stream), since frames from other - * streams cannot occur between the HEADERS/CONTINUATION frames of particular - * stream (RFC 7540, sections 6.2, 6.10, 8.1). + * FSM states for HTTP/2 frames processing. 
*/ -typedef struct tfw_h2_ctx_t { - spinlock_t lock; - TfwSettings lsettings; - TfwSettings rsettings; - unsigned long streams_num; - TfwStreamSched sched; - TfwStreamQueue closed_streams; - unsigned int lstream_id; - long int loc_wnd; - long int rem_wnd; - TfwHPack hpack; - TfwStream *cur_send_headers; - TfwStream *cur_recv_headers; - bool sent_settings[_HTTP2_SETTINGS_MAX]; - char __off[0]; - struct sk_buff *skb_head; - TfwStream *cur_stream; - TfwFramePri priority; - TfwFrameHdr hdr; - unsigned int plen; - int state; - int to_read; - int rlen; - unsigned char rbuf[FRAME_HEADER_SIZE]; - unsigned char padlen; - unsigned char data_off; - struct { - unsigned short flags; - unsigned int hdr_tbl_sz; - } new_settings; -} TfwH2Ctx; +typedef enum { + HTTP2_RECV_FRAME_HEADER, + HTTP2_RECV_CLI_START_SEQ, + HTTP2_RECV_FIRST_SETTINGS, + HTTP2_RECV_FRAME_PRIORITY, + HTTP2_RECV_FRAME_WND_UPDATE, + HTTP2_RECV_FRAME_PING, + HTTP2_RECV_FRAME_RST_STREAM, + HTTP2_RECV_FRAME_SETTINGS, + HTTP2_RECV_FRAME_GOAWAY, + HTTP2_RECV_FRAME_PADDED, + HTTP2_RECV_HEADER_PRI, + HTTP2_IGNORE_FRAME_DATA, + __HTTP2_RECV_FRAME_APP, + HTTP2_RECV_HEADER = __HTTP2_RECV_FRAME_APP, + HTTP2_RECV_CONT, + HTTP2_RECV_DATA, + HTTP2_RECV_APP_DATA_POST +} TfwFrameState; + +#define MAX_WND_SIZE ((1U << 31) - 1) +#define DEF_WND_SIZE ((1U << 16) - 1) typedef struct tfw_conn_t TfwConn; -int tfw_h2_init(void); -void tfw_h2_cleanup(void); -int tfw_h2_context_init(TfwH2Ctx *ctx); -void tfw_h2_context_clear(TfwH2Ctx *ctx); int tfw_h2_frame_process(TfwConn *c, struct sk_buff *skb, struct sk_buff **next); -void tfw_h2_conn_streams_cleanup(TfwH2Ctx *ctx); -TfwStream *tfw_h2_find_not_closed_stream(TfwH2Ctx *ctx, unsigned int id, - bool recv); -unsigned int tfw_h2_req_stream_id(TfwHttpReq *req); -void tfw_h2_req_unlink_stream(TfwHttpReq *req); -void tfw_h2_req_unlink_stream_with_rst(TfwHttpReq *req); -void tfw_h2_conn_terminate_close(TfwH2Ctx *ctx, TfwH2Err err_code, bool close, - bool attack); int tfw_h2_send_rst_stream(TfwH2Ctx *ctx, unsigned int id, TfwH2Err err_code); - -int tfw_h2_make_headers_frames(struct sock *sk, struct sk_buff *skb, - TfwH2Ctx *ctx, TfwStream *stream, - unsigned int mss_now, unsigned int limit, - unsigned int *t_tz); -int tfw_h2_make_data_frames(struct sock *sk, struct sk_buff *skb, - TfwH2Ctx *ctx, TfwStream *stream, - unsigned int mss_now, unsigned int limit, - unsigned int *t_tz); -int tfw_h2_insert_frame_header(struct sock *sk, struct sk_buff *skb, - TfwStream *stream, unsigned int mss_now, - TfwMsgIter *it, char **data, - const TfwStr *frame_hdr_str, - unsigned int *t_tz); +int tfw_h2_send_goaway(TfwH2Ctx *ctx, TfwH2Err err_code, bool attack); +int tfw_h2_make_frames(struct sock *sk, TfwH2Ctx *ctx, unsigned long smd_wnd, + int ss_action, bool *data_is_available); static inline void tfw_h2_pack_frame_header(unsigned char *p, const TfwFrameHdr *hdr) @@ -299,13 +190,4 @@ tfw_h2_unpack_frame_header(TfwFrameHdr *hdr, const unsigned char *buf) __func__, hdr->length, hdr->stream_id, hdr->type, hdr->flags); } -static inline void -tfw_h2_conn_reset_stream_on_close(TfwH2Ctx *ctx, TfwStream *stream) -{ - if (ctx->cur_send_headers == stream) - ctx->cur_send_headers = NULL; - if (ctx->cur_recv_headers == stream) - ctx->cur_recv_headers = NULL; -} - #endif /* __HTTP_FRAME__ */ diff --git a/fw/http_msg.c b/fw/http_msg.c index aea94d5ec..16ce45bd4 100644 --- a/fw/http_msg.c +++ b/fw/http_msg.c @@ -1472,8 +1472,15 @@ __tfw_http_msg_move_body(TfwHttpResp *resp, struct sk_buff *nskb) return 0; } -static inline int 
-__tfw_http_msg_linear_transform(TfwMsgIter *it) +/* + * Move linear data to paged fragment before inserting data into skb. + * We must do it, because we want to insert new data "before" linear. + * For instance: We want to insert headers. Linear data contains part + * of the body, if we insert headers without moving linear part, + * headers will be inserted after the body or between the body chunks. + */ +int +tfw_http_msg_linear_transform(TfwMsgIter *it) { /* * There is no sense to move linear part if next skb has linear @@ -1519,15 +1526,8 @@ __tfw_http_msg_expand_from_pool(TfwHttpResp *resp, const TfwStr *str, BUG_ON(it->skb->len > SS_SKB_MAX_DATA_LEN); - /* - * Move linear data to paged fragment before inserting data into skb. - * We must do it, because we want to insert new data "before" linear. - * For instance: We want to insert headers. Linear data contains part - * of the body, if we insert headers without moving linear part, - * headers will be inserted after the body or between the body chunks. - */ if (skb_headlen(it->skb)) { - if (unlikely((r = __tfw_http_msg_linear_transform(it)))) + if (unlikely((r = tfw_http_msg_linear_transform(it)))) return r; } diff --git a/fw/http_msg.h b/fw/http_msg.h index 619d7dd82..e58ce2ff8 100644 --- a/fw/http_msg.h +++ b/fw/http_msg.h @@ -122,7 +122,7 @@ tfw_h2_msg_transform_setup(TfwHttpTransIter *mit, struct sk_buff *skb, static inline int tfw_h2_msg_hdr_add(TfwHttpResp *resp, char *name, size_t nlen, char *val, - size_t vlen, unsigned short idx, unsigned int stream_id) + size_t vlen, unsigned short idx) { TfwStr hdr = { .chunks = (TfwStr []){ @@ -134,7 +134,7 @@ tfw_h2_msg_hdr_add(TfwHttpResp *resp, char *name, size_t nlen, char *val, .hpack_idx = idx }; - return tfw_hpack_encode(resp, &hdr, true, true, stream_id); + return tfw_hpack_encode(resp, &hdr, true, true); } int __must_check __tfw_http_msg_add_str_data(TfwHttpMsg *hm, TfwStr *str, @@ -178,9 +178,10 @@ int __hdr_name_cmp(const TfwStr *hdr, const TfwStr *cmp_hdr); int __http_hdr_lookup(TfwHttpMsg *hm, const TfwStr *hdr); int tfw_h2_msg_cutoff_headers(TfwHttpResp *resp, TfwHttpRespCleanup* cleanup); int tfw_http_msg_insert(TfwMsgIter *it, char **off, const TfwStr *data); +int tfw_http_msg_linear_transform(TfwMsgIter *it); -#define TFW_H2_MSG_HDR_ADD(hm, name, val, idx, stream_id) \ +#define TFW_H2_MSG_HDR_ADD(hm, name, val, idx) \ tfw_h2_msg_hdr_add(hm, name, sizeof(name) - 1, val, \ - sizeof(val) - 1, idx, stream_id) + sizeof(val) - 1, idx) #endif /* __TFW_HTTP_MSG_H__ */ diff --git a/fw/http_sess.c b/fw/http_sess.c index 426508f27..8e8392e1e 100644 --- a/fw/http_sess.c +++ b/fw/http_sess.c @@ -329,7 +329,7 @@ tfw_http_sticky_calc(TfwHttpReq *req, StickyVal *sv) * to the HTTP response' header block. */ static int -tfw_http_sticky_add(TfwHttpResp *resp, bool cache, unsigned int stream_id) +tfw_http_sticky_add(TfwHttpResp *resp, bool cache) { int r; static const unsigned int len = sizeof(StickyVal) * 2; @@ -362,8 +362,7 @@ tfw_http_sticky_add(TfwHttpResp *resp, bool cache, unsigned int stream_id) if (to_h2) { set_cookie.hpack_idx = 55; - r = tfw_hpack_encode(resp, &set_cookie, !cache, !cache, - stream_id); + r = tfw_hpack_encode(resp, &set_cookie, !cache, !cache); } else if (cache) { TfwHttpTransIter *mit = &resp->mit; @@ -577,8 +576,7 @@ tfw_http_sticky_req_process(TfwHttpReq *req, StickyVal *sv, TfwStr *cookie_val) * Add Tempesta sticky cookie to an HTTP response if needed. 
*/ int -tfw_http_sess_resp_process(TfwHttpResp *resp, bool cache, - unsigned int stream_id) +tfw_http_sess_resp_process(TfwHttpResp *resp, bool cache) { TfwHttpReq *req = resp->req; TfwStickyCookie *sticky = req->vhost->cookie; @@ -600,7 +598,7 @@ tfw_http_sess_resp_process(TfwHttpResp *resp, bool cache, */ if (test_bit(TFW_HTTP_B_HAS_STICKY, req->flags)) return 0; - return tfw_http_sticky_add(resp, cache, stream_id); + return tfw_http_sticky_add(resp, cache); } /** diff --git a/fw/http_sess.h b/fw/http_sess.h index ea996b286..802dc6e38 100644 --- a/fw/http_sess.h +++ b/fw/http_sess.h @@ -178,8 +178,7 @@ enum { int tfw_http_sess_obtain(TfwHttpReq *req); int tfw_http_sess_learn(TfwHttpResp *resp); -int tfw_http_sess_resp_process(TfwHttpResp *resp, bool cache, - unsigned int stream_id); +int tfw_http_sess_resp_process(TfwHttpResp *resp, bool cache); void tfw_http_sess_put(TfwHttpSess *sess); void tfw_http_sess_pin_vhost(TfwHttpSess *sess, TfwVhost *vhost); diff --git a/fw/http_stream.c b/fw/http_stream.c index 1104fedb8..e44792b31 100644 --- a/fw/http_stream.c +++ b/fw/http_stream.c @@ -47,33 +47,22 @@ tfw_h2_stream_cache_destroy(void) kmem_cache_destroy(stream_cache); } -static int -tfw_h2_find_stream_dep(TfwStreamSched *sched, unsigned int id, TfwStream **dep) -{ - /* - * TODO: implement dependency/priority logic (according to RFC 7540 - * section 5.3) in context of #1196. - */ - return 0; -} -static void -tfw_h2_add_stream_dep(TfwStreamSched *sched, TfwStream *stream, TfwStream *dep, - bool excl) +static inline void +tfw_h2_conn_reset_stream_on_close(TfwH2Ctx *ctx, TfwStream *stream) { - /* - * TODO: implement dependency/priority logic (according to RFC 7540 - * section 5.3) in context of #1196. - */ + if (ctx->cur_send_headers == stream) + ctx->cur_send_headers = NULL; + if (ctx->cur_recv_headers == stream) + ctx->cur_recv_headers = NULL; } -static void -tfw_h2_remove_stream_dep(TfwStreamSched *sched, TfwStream *stream) +static inline void +tfw_h2_stream_purge_all(TfwStream *stream) { - /* - * TODO: implement dependency/priority logic (according to RFC 7540 - * section 5.3) in context of #1196. - */ + ss_skb_queue_purge(&stream->xmit.skb_head); + ss_skb_queue_purge(&stream->xmit.postponed); + stream->xmit.h_len = stream->xmit.b_len = 0; } static void @@ -81,8 +70,15 @@ tfw_h2_stop_stream(TfwStreamSched *sched, TfwStream *stream) { TfwH2Ctx *ctx = container_of(sched, TfwH2Ctx, sched); - tfw_h2_conn_reset_stream_on_close(ctx, stream); + /* + * Should be done before purging stream send queue, + * to correct adjusting count of active streams in + * the scheduler. 
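+	 *
+	 * A sketch of why the order matters (deduced from the
+	 * surrounding code, not stated explicitly): if the purge ran
+	 * first,
+	 *
+	 *   tfw_h2_stream_purge_all_and_free_response(stream);
+	 *       => stream->xmit.skb_head == NULL,
+	 *          tfw_h2_stream_is_active(stream) == false
+	 *   tfw_h2_remove_stream_dep(sched, stream);
+	 *       => the parent schedulers' active_cnt would not be
+	 *          decremented for this stream, leaving stale
+	 *          counters in the WFQ tree,
+	 *
+	 * hence the dependency-tree removal has to come first.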
+ */ tfw_h2_remove_stream_dep(sched, stream); + tfw_h2_stream_purge_all_and_free_response(stream); + + tfw_h2_conn_reset_stream_on_close(ctx, stream); rb_erase(&stream->node, &sched->streams); } @@ -91,6 +87,9 @@ tfw_h2_init_stream(TfwStream *stream, unsigned int id, unsigned short weight, long int loc_wnd, long int rem_wnd) { RB_CLEAR_NODE(&stream->node); + bzero_fast(&stream->sched_node, sizeof(stream->sched_node)); + stream->sched_state = HTTP2_STREAM_SCHED_STATE_UNKNOWN; + tfw_h2_init_stream_sched_entry(&stream->sched); INIT_LIST_HEAD(&stream->hcl_node); spin_lock_init(&stream->st_lock); stream->id = id; @@ -134,6 +133,84 @@ tfw_h2_add_stream(TfwStreamSched *sched, unsigned int id, unsigned short weight, return new_stream; } +void +tfw_h2_stream_purge_send_queue(TfwStream *stream) +{ + unsigned long len = stream->xmit.h_len + stream->xmit.b_len + + stream->xmit.frame_length; + struct sk_buff *skb; + + while (len) { + skb = ss_skb_dequeue(&stream->xmit.skb_head); + BUG_ON(!skb); + + len -= skb->len; + kfree_skb(skb); + } + stream->xmit.h_len = stream->xmit.b_len = stream->xmit.frame_length = 0; +} + +void +tfw_h2_stream_purge_all_and_free_response(TfwStream *stream) +{ + TfwHttpResp*resp = stream->xmit.resp; + + if (resp) { + tfw_http_resp_pair_free_and_put_conn(resp); + stream->xmit.resp = NULL; + } + tfw_h2_stream_purge_all(stream); +} + +void +tfw_h2_stream_add_idle(TfwH2Ctx *ctx, TfwStream *idle) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + struct list_head *pos, *prev = &ctx->idle_streams.list; + bool found = false; + + /* + * We add and remove streams from idle queue under the + * socket lock. + */ + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + + /* + * Found first idle stream with id less than new idle + * stream, then insert new stream before this stream. + */ + list_for_each(pos, &ctx->idle_streams.list) { + TfwStream *stream = list_entry(pos, TfwStream, hcl_node); + + if (idle->id > stream->id) { + found = true; + break; + } + prev = &stream->hcl_node; + } + + if (found) { + list_add(&idle->hcl_node, prev); + idle->queue = &ctx->idle_streams; + ++idle->queue->num; + } else { + tfw_h2_stream_add_to_queue_nolock(&ctx->idle_streams, idle); + } +} + +void +tfw_h2_stream_remove_idle(TfwH2Ctx *ctx, TfwStream *stream) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + + /* + * We add and remove streams from idle queue under the + * socket lock. + */ + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + tfw_h2_stream_del_from_queue_nolock(stream); +} + /* * Create a new stream and add it to the streams storage and to the dependency * tree. 
Note, that we do not need to protect the streams storage in @sched from @@ -143,13 +220,12 @@ tfw_h2_add_stream(TfwStreamSched *sched, unsigned int id, unsigned short weight, TfwStream * tfw_h2_stream_create(TfwH2Ctx *ctx, unsigned int id) { - TfwStream *stream, *dep = NULL; + TfwStream *stream; + TfwStreamSchedEntry *dep = NULL; TfwFramePri *pri = &ctx->priority; bool excl = pri->exclusive; - if (tfw_h2_find_stream_dep(&ctx->sched, pri->stream_id, &dep)) - return NULL; - + dep = tfw_h2_find_stream_dep(&ctx->sched, pri->stream_id); stream = tfw_h2_add_stream(&ctx->sched, id, pri->weight, ctx->lsettings.wnd_sz, ctx->rsettings.wnd_sz); @@ -160,10 +236,10 @@ tfw_h2_stream_create(TfwH2Ctx *ctx, unsigned int id) ++ctx->streams_num; - T_DBG3("%s: ctx [%p] (streams_num %lu, dep strm id %u, dep strm [%p], excl %u)\n" - "added strm [%p] id %u weight %u\n", - __func__, ctx, ctx->streams_num, pri->stream_id, dep, pri->exclusive, - stream, id, stream->weight); + T_DBG3("%s: ctx [%p] (streams_num %lu, dep strm id %u, dep strm [%p]," + "excl %u) added strm [%p] id %u weight %u\n", + __func__, ctx, ctx->streams_num, pri->stream_id, dep, + pri->exclusive, stream, id, stream->weight); return stream; } @@ -228,40 +304,6 @@ tfw_h2_stream_add_closed(TfwH2Ctx *ctx, TfwStream *stream) spin_unlock(&ctx->lock); } -/* - * Stream closing procedure: move the stream into special queue of closed - * streams and send RST_STREAM frame to peer. This procedure is intended - * for usage only in receiving flow of Framing layer, thus the stream is - * definitely alive here and we need not any unlinking operations since - * all the unlinking and cleaning work will be made later, during shrinking - * the queue of closed streams; thus, we just move the stream into the - * closed queue here. - * We also reset the current stream of the H2 context here. - */ -int -tfw_h2_stream_close(TfwH2Ctx *ctx, unsigned int id, TfwStream **stream, - TfwH2Err err_code) -{ - if (stream && *stream) { - T_DBG3("%s: ctx [%p] strm %p id %d err %u\n", __func__, - ctx, *stream, id, err_code); - tfw_h2_conn_reset_stream_on_close(ctx, *stream); - if (tfw_h2_get_stream_state(*stream) > - HTTP2_STREAM_REM_HALF_CLOSED) { - tfw_h2_stream_add_closed(ctx, *stream); - } else { - /* - * This function is always called after processing - * RST STREAM or stream error. - */ - BUG(); - } - *stream = NULL; - } - - return tfw_h2_send_rst_stream(ctx, id, err_code); -} - /* * Stream FSM processing during frames receipt (see RFC 7540 section * 5.1 for details). 
@@ -434,12 +476,6 @@ do { \ break; } - if (send) { - TFW_H2_FSM_TYPE_CHECK(ctx, stream, send, type); - } else { - TFW_H2_FSM_TYPE_CHECK(ctx, stream, recv, type); - } - if (type == HTTP2_CONTINUATION) { /* * Empty CONTINUATION frames without END_HEADERS flag @@ -572,6 +608,7 @@ do { \ HTTP2_F_END_STREAM)) { case HTTP2_F_END_HEADERS | HTTP2_F_END_STREAM: + ctx->cur_recv_headers = NULL; SET_STATE(HTTP2_STREAM_CLOSED); break; case HTTP2_F_END_HEADERS: @@ -582,8 +619,9 @@ do { \ ctx->cur_recv_headers = NULL; break; case HTTP2_F_END_STREAM: - SET_STATE(HTTP2_STREAM_CLOSED); - ctx->cur_recv_headers = NULL; + ctx->cur_recv_headers = stream; + stream->state |= + HTTP2_STREAM_RECV_END_OF_STREAM; break; default: ctx->cur_recv_headers = stream; @@ -616,9 +654,46 @@ do { \ case HTTP2_STREAM_REM_HALF_CLOSED: if (send) { - if (type == HTTP2_RST_STREAM - || flags & HTTP2_F_END_STREAM) + if (type == HTTP2_HEADERS || + type == HTTP2_CONTINUATION) { + switch (flags + & (HTTP2_F_END_HEADERS | + HTTP2_F_END_STREAM)) + { + /* + * RFC 9113 5.1 (half-closed (remote) state): + * A stream can transition from this state to + * "closed" by sending a frame with the + * END_STREAM flag set. + */ + case HTTP2_F_END_STREAM: + ctx->cur_send_headers = stream; + stream->state |= + HTTP2_STREAM_SEND_END_OF_STREAM; + break; + case HTTP2_F_END_HEADERS | HTTP2_F_END_STREAM: + ctx->cur_send_headers = NULL; + SET_STATE(HTTP2_STREAM_CLOSED); + break; + case HTTP2_F_END_HEADERS: + /* + * Headers are ended, next frame in the + * stream should be DATA frame. + */ + ctx->cur_send_headers = NULL; + break; + + default: + ctx->cur_send_headers = stream; + break; + } + } else if (type == HTTP2_DATA) { + if (flags & HTTP2_F_END_STREAM) + SET_STATE(HTTP2_STREAM_CLOSED); + } else if (type == HTTP2_RST_STREAM) { SET_STATE(HTTP2_STREAM_REM_CLOSED); + } + break; } @@ -639,9 +714,9 @@ do { \ /* * We always send RST_STREAM to the peer in this case; * thus, the stream should be switched to the - * 'closed (remote)' state. + * 'closed' state. */ - SET_STATE(HTTP2_STREAM_REM_CLOSED); + SET_STATE(HTTP2_STREAM_CLOSED); *err = HTTP2_ECODE_CLOSED; res = STREAM_FSM_RES_TERM_STREAM; } @@ -654,19 +729,23 @@ do { \ * frame on a stream in the "open" or "half-closed (local)" state. */ case HTTP2_STREAM_LOC_CLOSED: + if (send) { + res = STREAM_FSM_RES_IGNORE; + break; + } + /* * RFC 9113 section 5.1: * An endpoint that sends a RST_STREAM frame on a stream * that is in the "open" or "half-closed (local)" state * could receive any type of frame. + * An endpoint MUST minimally process and then discard + * any frames it receives in this state. */ - if (send) { - res = STREAM_FSM_RES_IGNORE; - break; - } - if (type == HTTP2_RST_STREAM) SET_STATE(HTTP2_STREAM_CLOSED); + else if (type != HTTP2_WINDOW_UPDATE) + res = STREAM_FSM_RES_IGNORE; break; @@ -701,20 +780,22 @@ do { \ " flags=0x%hhx\n", __func__, stream->id, type, flags); if (send) { res = STREAM_FSM_RES_IGNORE; - break; + } else { + if (type != HTTP2_PRIORITY) { + *err = HTTP2_ECODE_PROTO; + res = STREAM_FSM_RES_TERM_CONN; + } } - /* - * In moment when the final 'closed' state is achieved, stream - * actually must be removed from stream's storage (and from - * memory), thus the receive execution flow must not reach this - * point. 
- */ - fallthrough; + + break; default: BUG(); } finish: + if (type == HTTP2_RST_STREAM || res == STREAM_FSM_RES_TERM_STREAM) + tfw_h2_conn_reset_stream_on_close(ctx, stream); + T_DBG3("exit %s: strm [%p] state %d(%s), res %d\n", __func__, stream, tfw_h2_get_stream_state(stream), __h2_strm_st_n(stream), res); @@ -749,38 +830,38 @@ tfw_h2_find_stream(TfwStreamSched *sched, unsigned int id) void tfw_h2_delete_stream(TfwStream *stream) { + BUG_ON(stream->xmit.resp || stream->xmit.skb_head); kmem_cache_free(stream_cache, stream); } -void -tfw_h2_change_stream_dep(TfwStreamSched *sched, unsigned int stream_id, - unsigned int new_dep, unsigned short new_weight, - bool excl) -{ - /* - * TODO: implement dependency/priority logic (according to RFC 7540 - * section 5.3) in context of #1196. - */ -} - int -tfw_h2_stream_init_for_xmit(TfwHttpReq *req, unsigned long h_len, - unsigned long b_len) +tfw_h2_stream_init_for_xmit(TfwHttpResp *resp, TfwStreamXmitState state, + unsigned long h_len, unsigned long b_len) { - TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + TfwH2Ctx *ctx = tfw_h2_context_unsafe(resp->req->conn); + struct sk_buff *skb_head = resp->msg.skb_head; TfwStream *stream; spin_lock(&ctx->lock); - stream = req->stream; + stream = resp->req->stream; if (!stream) { spin_unlock(&ctx->lock); return -EPIPE; } + TFW_SKB_CB(skb_head)->opaque_data = resp; + TFW_SKB_CB(skb_head)->destructor = tfw_http_resp_pair_free_and_put_conn; + TFW_SKB_CB(skb_head)->on_send = tfw_http_on_send_resp; + TFW_SKB_CB(skb_head)->stream_id = stream->id; + + stream->xmit.resp = NULL; + stream->xmit.skb_head = NULL; stream->xmit.h_len = h_len; stream->xmit.b_len = b_len; - tfw_h2_stream_xmit_reinit(&stream->xmit); + stream->xmit.state = state; + stream->xmit.frame_length = 0; + stream->xmit.is_blocked = false; spin_unlock(&ctx->lock); @@ -790,21 +871,18 @@ tfw_h2_stream_init_for_xmit(TfwHttpReq *req, unsigned long h_len, TfwStreamFsmRes tfw_h2_stream_send_process(TfwH2Ctx *ctx, TfwStream *stream, unsigned char type) { - TfwStreamFsmRes r; unsigned char flags = 0; - BUG_ON(stream->xmit.h_len && stream->xmit.b_len); + if (stream->xmit.h_len && !stream->xmit.b_len + && type == HTTP2_HEADERS) + flags |= HTTP2_F_END_STREAM; if (!stream->xmit.h_len && type != HTTP2_DATA) flags |= HTTP2_F_END_HEADERS; - if (!stream->xmit.b_len) + if (!stream->xmit.h_len && !stream->xmit.b_len + && !tfw_h2_stream_is_eos_sent(stream)) flags |= HTTP2_F_END_STREAM; - r = tfw_h2_stream_fsm_ignore_err(ctx, stream, type, flags); - if (flags & HTTP2_F_END_STREAM - || (r && r != STREAM_FSM_RES_IGNORE)) - tfw_h2_stream_add_closed(ctx, stream); - - return r != STREAM_FSM_RES_IGNORE ? r : STREAM_FSM_RES_OK; + return tfw_h2_stream_fsm_ignore_err(ctx, stream, type, flags); } diff --git a/fw/http_stream.h b/fw/http_stream.h index 7d1de29d3..0064ce074 100644 --- a/fw/http_stream.h +++ b/fw/http_stream.h @@ -20,11 +20,11 @@ #ifndef __HTTP_STREAM__ #define __HTTP_STREAM__ -#include - #include "msg.h" #include "http_parser.h" +#include "http_stream_sched.h" #include "lib/str.h" +#include "ss_skb.h" /** * States for HTTP/2 streams processing. @@ -32,9 +32,7 @@ * NOTE: there is no exact matching between these states and states from * RFC 7540 (section 5.1), since several intermediate states were added in * current implementation to handle some edge states which are not mentioned - * explicitly in RFC (special kinds of closed state). 
Besides, there is no - * explicit 'idle' state here, since in current implementation idle stream - * is just a stream that has not been created yet. + * explicitly in RFC (special kinds of closed state). */ typedef enum { HTTP2_STREAM_IDLE, @@ -55,6 +53,21 @@ enum { HTTP2_STREAM_RECV_END_OF_STREAM = 0x2 << HTTP2_STREAM_FLAGS_OFFSET, }; +/* + * We use 3 bits for this state in TfwHttpXmit structure. + * If you add some new state here, do not forget to increase + * count of bits used for this state. + */ +typedef enum { + HTTP2_ENCODE_HEADERS, + HTTP2_RELEASE_RESPONSE, + HTTP2_MAKE_HEADERS_FRAMES, + HTTP2_MAKE_CONTINUATION_FRAMES, + HTTP2_MAKE_DATA_FRAMES, + HTTP2_SEND_FRAMES, + HTTP2_MAKE_FRAMES_FINISH, +} TfwStreamXmitState; + static const char *__tfw_strm_st_names[] = { [HTTP2_STREAM_IDLE] = "HTTP2_STREAM_IDLE", [HTTP2_STREAM_LOC_RESERVED] = "HTTP2_STREAM_LOC_RESERVED", @@ -103,31 +116,36 @@ typedef enum { * Last http2 response info, used to prepare frames * in `xmit` callbacks. * + * @resp - responce, that should be sent; + * @skb_head - head of skb list that must be sent; + * @postponed - head of skb list that must be sent + * after sending headers for this stream; * @h_len - length of headers in http2 response; + * @frame_length - length of current sending frame, or 0 + * if we send some service frames (for + * example RST STREAM after all pending data); * @b_len - length of body in http2 response; - * @__off - offset to reinitialize processing context; - * @processed - count of bytes, processed during prepare xmit - * callback; - * @nskbs - count of skbs processed during prepare xmit callback; + * @is_blocked - stream is blocked; + * @state - current stream xmit state (what type of + * frame should be made for this stream); */ typedef struct { - unsigned long h_len; - unsigned long b_len; - char __off[0]; - unsigned int processed; - unsigned int nskbs; + TfwHttpResp *resp; + struct sk_buff *skb_head; + struct sk_buff *postponed; + unsigned int h_len; + unsigned int frame_length; + u64 b_len : 60; + u64 is_blocked : 1; + u64 state : 3; } TfwHttpXmit; /** - * Limited queue for temporary storage of half-closed or pending half-closed - * streams. + * Limited queue for temporary storage of idle or closed streams * This structure provides the possibility of temporary existing in memory - - * for streams which are in HTTP2_STREAM_LOC_CLOSED or HTTP2_STREAM_REM_CLOSED - * states (see RFC 7540, section 5.1, the 'closed' paragraph). Note, that - * streams in HTTP2_STREAM_CLOSED state are not stored in this queue and must - * be removed right away. + * for streams which are in HTTP2_STREAM_CLOSED state. * - * @list - list of streams which are in closed state; + * @list - list of streams; * @num - number of streams in the list; */ typedef struct { @@ -135,10 +153,19 @@ typedef struct { unsigned long num; } TfwStreamQueue; +typedef enum { + HTTP2_STREAM_SCHED_STATE_UNKNOWN, + HTTP2_STREAM_SCHED_STATE_BLOCKED, + HTTP2_STREAM_SCHED_STATE_ACTIVE, +} TfwStreamSchedState; + /** * Representation of HTTP/2 stream entity. 
* * @node - entry in per-connection storage of streams (red-black tree); + * @sched_node - entry in per-connection priority storage of active streams; + * sched_state - state of stream in the per-connection scheduler; + * @sched - scheduler for child streams; * @hcl_node - entry in queue of half-closed or closed streams; * @id - stream ID; * @state - stream's current state; @@ -153,6 +180,9 @@ typedef struct { */ struct tfw_http_stream_t { struct rb_node node; + struct eb64_node sched_node; + TfwStreamSchedState sched_state; + TfwStreamSchedEntry sched; struct list_head hcl_node; unsigned int id; int state; @@ -166,26 +196,13 @@ struct tfw_http_stream_t { TfwHttpXmit xmit; }; -/** - * Scheduler for stream's processing distribution based on dependency/priority - * values. - * TODO: the structure is not completed yet and should be finished in context - * of #1196. - * - * @streams - root red-black tree entry for per-connection streams' storage; - */ -typedef struct { - struct rb_root streams; -} TfwStreamSched; - typedef struct tfw_h2_ctx_t TfwH2Ctx; int tfw_h2_stream_cache_create(void); void tfw_h2_stream_cache_destroy(void); -TfwStream * tfw_h2_stream_create(TfwH2Ctx *ctx, unsigned int id); +TfwStream *tfw_h2_stream_create(TfwH2Ctx *ctx, unsigned int id); +void tfw_h2_stream_remove_idle(TfwH2Ctx *ctx, TfwStream *stream); void tfw_h2_stream_clean(TfwH2Ctx *ctx, TfwStream *stream); -int tfw_h2_stream_close(TfwH2Ctx *ctx, unsigned int id, TfwStream **stream, - TfwH2Err err_code); void tfw_h2_stream_unlink_nolock(TfwH2Ctx *ctx, TfwStream *stream); void tfw_h2_stream_unlink_lock(TfwH2Ctx *ctx, TfwStream *stream); TfwStreamFsmRes tfw_h2_stream_fsm(TfwH2Ctx *ctx, TfwStream *stream, @@ -193,14 +210,14 @@ TfwStreamFsmRes tfw_h2_stream_fsm(TfwH2Ctx *ctx, TfwStream *stream, bool send, TfwH2Err *err); TfwStream *tfw_h2_find_stream(TfwStreamSched *sched, unsigned int id); void tfw_h2_delete_stream(TfwStream *stream); -void tfw_h2_change_stream_dep(TfwStreamSched *sched, unsigned int stream_id, - unsigned int new_dep, unsigned short new_weight, - bool excl); -int tfw_h2_stream_init_for_xmit(TfwHttpReq *req, unsigned long h_len, - unsigned long b_len); +int tfw_h2_stream_init_for_xmit(TfwHttpResp *resp, TfwStreamXmitState state, + unsigned long h_len, unsigned long b_len); void tfw_h2_stream_add_closed(TfwH2Ctx *ctx, TfwStream *stream); +void tfw_h2_stream_add_idle(TfwH2Ctx *ctx, TfwStream *idle); TfwStreamFsmRes tfw_h2_stream_send_process(TfwH2Ctx *ctx, TfwStream *stream, unsigned char type); +void tfw_h2_stream_purge_send_queue(TfwStream *stream); +void tfw_h2_stream_purge_all_and_free_response(TfwStream *stream); static inline TfwStreamState tfw_h2_get_stream_state(TfwStream *stream) @@ -233,10 +250,25 @@ __h2_strm_st_n(TfwStream *stream) return __tfw_strm_st_names[tfw_h2_get_stream_state(stream)]; } +static inline bool +tfw_h2_stream_is_active(TfwStream *stream) +{ + return stream->xmit.skb_head && !stream->xmit.is_blocked; +} + static inline void -tfw_h2_stream_xmit_reinit(TfwHttpXmit *xmit) +tfw_h2_stream_try_unblock(TfwStreamSched *sched, TfwStream *stream) { - bzero_fast(xmit->__off, sizeof(*xmit) - offsetof(TfwHttpXmit, __off)); + bool stream_was_blocked = stream->xmit.is_blocked; + + if (stream->rem_wnd > 0) { + stream->xmit.is_blocked = false; + if (stream->xmit.skb_head && stream_was_blocked) { + sched->blocked_streams--; + BUG_ON(sched->blocked_streams < 0); + tfw_h2_sched_activate_stream(sched, stream); + } + } } static inline bool @@ -297,4 +329,55 @@ 
tfw_h2_stream_del_from_queue_nolock(TfwStream *stream) stream->queue = NULL; } +static inline u64 +tfw_h2_stream_default_deficit(TfwStream *stream) +{ + static const unsigned tbl[256] = { + 65536, 32768, 21845, 16384, 13107, 10922, 9362, 8192, 7281, + 6553, 5957, 5461, 5041, 4681, 4369, 4096, 3855, 3640, 3449, + 3276, 3120, 2978, 2849, 2730, 2621, 2520, 2427, 2340, 2259, + 2184, 2114, 2048, 1985, 1927, 1872, 1820, 1771, 1724, 1680, + 1638, 1598, 1560, 1524, 1489, 1456, 1424, 1394, 1365, 1337, + 1310, 1285, 1260, 1236, 1213, 1191, 1170, 1149, 1129, 1110, + 1092, 1074, 1057, 1040, 1024, 1008, 992, 978, 963, 949, 936, + 923, 910, 897, 885, 873, 862, 851, 840, 829, 819, 809, 799, + 789, 780, 771, 762, 753, 744, 736, 728, 720, 712, 704, 697, + 689, 682, 675, 668, 661, 655, 648, 642, 636, 630, 624, 618, + 612, 606, 601, 595, 590, 585, 579, 574, 569, 564, 560, 555, + 550, 546, 541, 537, 532, 528, 524, 520, 516, 512, 508, 504, + 500, 496, 492, 489, 485, 481, 478, 474, 471, 468, 464, 461, + 458, 455, 451, 448, 445, 442, 439, 436, 434, 431, 428, 425, + 422, 420, 417, 414, 412, 409, 407, 404, 402, 399, 397, 394, + 392, 390, 387, 385, 383, 381, 378, 376, 374, 372, 370, 368, + 366, 364, 362, 360, 358, 356, 354, 352, 350, 348, 346, 344, + 343, 341, 339, 337, 336, 334, 332, 330, 329, 327, 326, 324, + 322, 321, 319, 318, 316, 315, 313, 312, 310, 309, 307, 306, + 304, 303, 302, 300, 299, 297, 296, 295, 293, 292, 291, 289, + 288, 287, 286, 284, 283, 282, 281, 280, 278, 277, 276, 275, + 274, 273, 271, 270, 269, 268, 267, 266, 265, 264, 263, 262, + 261, 260, 259, 258, 257, 256 + }; + + return tbl[stream->weight - 1]; +} + +static inline u64 +tfw_h2_stream_recalc_deficit(TfwStream *stream) +{ + /* + * This function should be called only for streams, + * which were removed from scheduler. + */ + BUG_ON(stream->sched_node.node.leaf_p || + stream->sched_state != HTTP2_STREAM_SCHED_STATE_UNKNOWN); + /* deficit = last_deficit + constant / weight */ + return stream->sched_node.key + tfw_h2_stream_default_deficit(stream); +} + +static inline bool +tfw_h2_stream_has_default_deficit(TfwStream *stream) +{ + return stream->sched_node.key == tfw_h2_stream_default_deficit(stream); +} + #endif /* __HTTP_STREAM__ */ diff --git a/fw/http_stream_sched.c b/fw/http_stream_sched.c new file mode 100644 index 000000000..f4805e497 --- /dev/null +++ b/fw/http_stream_sched.c @@ -0,0 +1,616 @@ +/** + * Tempesta FW + * + * HTTP2 stream scheduler which implements stream prioritization + * accoring RFC 7540 5.3. + * + * There are two algorithm of stream prioritization which are described + * in RFC 7540 5.3 and RFC 9218. RFC 7540 5.3 is deprecated, but we + * implement our scheduler according to RFC 7540, because all modern + * browsers use RFC 7540 for HTTP2 stream prioritization and use modern + * RFC 9218 only for HTTP3. + * + * Before developing of our own HTTP streams scheduling logic, we analyzed + * how other open source HTTP servers implement this. + * Nginx not fully support RFC 7540. A frame is inserted into the sending list + * according to the rank (the level in the priority tree) of the stream and + * weight. But it does not correspond to the RFC: a server should not send data + * for a stream which depends on other streams. Also the algorithm can lead to + * O(n) complexity (linear scan) if each next frame has higher priority than + * the previous one. + * H20 uses an O(1) approach described as an Array of Queue. 
This is the very + * fast scheduler but it has two main disadvantages - it consumes a lot of + * memory and is not fair. + * We decide to implement WFQ algorithm. There are a lot of data structures + * which can be used for this purpose (list, different type of heaps and + * different types of trees). We analyzed some of them (e.g. Fibonacci heap, + * RB-tree, insertion sorted array etc) and found that the HAproxy’s ebtree + * provides the best performance (at least x2 faster than the closest in + * performance Fibonacci heap) on small data (about 100 to 1000 streams in a + * queue) to pick a minimum item and reinsert it. + * + * We use deficit as a key in our priority ebtree. Deficit of the stream + * calculated as decribed below: + * new: deficit = min_deficit_in_heap + constant / weight + * exist: deficit = last_deficit + constant / weight + * + * When we search for the most priority stream we iterate over the levels of + * the priority tree. For exanple: + * 1 (256) + * 3 (256) 5 (1) + * 7 (256) 9 (1) 11 (256) 13 (1) + * + * In this example we have streams 3 and 5 which depend on stream 1, + * streams 7 and 9 which depend on stream 3, and streams 11 and 13, which + * depend on stream 5. We start from stream 1 and if it is active (has data + * to send and not blocked by HTTP window exceeding) we return it. If is not + * active but has active children we move to the next level of the tree + * (streams 3 and 5) and choose the stream (which is active or has active + * children) with the lowest deficit. We remove it from the tree and if it + * is active return it. Later after sending data for this stream we recalculate + * its deficit (deficit = deficit + constant / weight) and insert it back to + * the tree. + * + * Copyright (C) 2024 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include "http_stream_sched.h" +#include "http_stream.h" +#include "connection.h" + +static inline void +tfw_h2_stream_sched_spin_lock_assert(TfwStreamSched *sched) +{ + TfwH2Ctx *ctx = container_of(sched, TfwH2Ctx, sched); + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + + /* + * All scheduler functions schould be called under the + * socket lock. + */ + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); +} + +/** + * Remove stream from the ebtree of the blocked streams and insert + * it in the ebtree of active streams. Should be called only for + * active streams or the streams with active children, which is + * not already in the ebtree of active streams. 
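+ *
+ * For a feel of the @deficit keys involved (the numbers follow the
+ * 65536 / weight table in tfw_h2_stream_default_deficit(), the
+ * scenario itself is illustrative): two sibling streams of weight 256
+ * and weight 1 that both stay active are re-inserted with keys growing
+ * by 256 and by 65536 per scheduling turn respectively, so the
+ * weight-256 stream is picked roughly 256 times more often, which is
+ * the usual WFQ behaviour.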
+ */ +static void +tfw_h2_stream_sched_insert_active(TfwStream *stream, u64 deficit) +{ + TfwStreamSchedEntry *parent = stream->sched.parent; + + BUG_ON(!parent || (!tfw_h2_stream_is_active(stream) && + !tfw_h2_stream_sched_is_active(&stream->sched))); + BUG_ON(stream->sched_state == HTTP2_STREAM_SCHED_STATE_ACTIVE); + + eb64_delete(&stream->sched_node); + stream->sched_node.key = deficit; + eb64_insert(&parent->active, &stream->sched_node); + stream->sched_state = HTTP2_STREAM_SCHED_STATE_ACTIVE; +} + +/** + * Remove stream from the ebtree of active streams and insert + * it in the ebtree of the blocked streams. Should be called + * only for the blocked streams and the streams without active + * children, which are not already in the ebtree of the blocked + * streams. + */ +static void +tfw_h2_stream_sched_insert_blocked(TfwStream *stream, u64 deficit) +{ + TfwStreamSchedEntry *parent = stream->sched.parent; + + BUG_ON(!parent || tfw_h2_stream_is_active(stream) + || tfw_h2_stream_sched_is_active(&stream->sched)); + BUG_ON(stream->sched_state == HTTP2_STREAM_SCHED_STATE_BLOCKED); + + eb64_delete(&stream->sched_node); + stream->sched_node.key = deficit; + eb64_insert(&parent->blocked, &stream->sched_node); + stream->sched_state = HTTP2_STREAM_SCHED_STATE_BLOCKED; +} + +/** + * Calculate minimum deficit for the current scheduler. + * New stream is inserted in the scheduler with + * deficit = min_deficit + 65536 / stream->weight. + */ +static u64 +tfw_h2_stream_sched_min_deficit(TfwStreamSchedEntry *parent) +{ + TfwStream *prio; + + /* + * First of all check active streams in the scheduler. + * If there are any active streams new stream is inserted + * with deficit = min_deficit + 65536 / stream->weight. + * Where min_deficit is a deficit of a most prio stream, + * if it was scheduled at least one time. + */ + prio = !eb_is_empty(&parent->active) ? + eb64_entry(eb64_first(&parent->active), TfwStream, sched_node) : + NULL; + if (prio) { + return tfw_h2_stream_has_default_deficit(prio) ? + 0 : prio->sched_node.key; + } + + /* Same for blocked streams. */ + prio = !eb_is_empty(&parent->blocked) ? + eb64_entry(eb64_first(&parent->blocked), TfwStream, sched_node) : + NULL; + if (prio) { + return tfw_h2_stream_has_default_deficit(prio) ? + 0 : prio->sched_node.key; + } + + return 0; +} + +/** + * Recalculate count of active streams for parent schedulers, when + * new stream is added to the priority tree. If parent scheduler + * is activated in this function, insert appropriate parent stream + * in the tree of active streams. + */ +static void +tfw_h2_stream_sched_propagate_add_active_cnt(TfwStreamSched *sched, + TfwStream *stream) +{ + TfwStreamSchedEntry *parent = stream->sched.parent; + bool stream_is_active = tfw_h2_stream_is_active(stream); + long int active_cnt = + stream->sched.active_cnt + (stream_is_active ? 1 : 0); + + if (!active_cnt) + return; + + while (true) { + bool need_activate = !tfw_h2_stream_sched_is_active(parent); + parent->active_cnt += active_cnt; + if (parent == &sched->root) + break; + + stream = container_of(parent, TfwStream, sched); + parent = stream->sched.parent; + /* + * Stream can have no parent if it is removed from + * the scheduler due to priority tree rebuilding. 
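+		 *
+		 * (An example of when this happens, inferred from the
+		 * code: in the exclusive branch of
+		 * tfw_h2_add_stream_dep() the children of @dep are
+		 * re-enqueued under the new stream before that stream
+		 * itself is enqueued, so its sched.parent is still
+		 * NULL and the upward walk stops here.)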
+ */ + if (!parent) + break; + + if (need_activate && !tfw_h2_stream_is_active(stream)) { + BUG_ON(stream->sched_state != HTTP2_STREAM_SCHED_STATE_BLOCKED); + tfw_h2_stream_sched_insert_active(stream, + stream->sched_node.key); + } + } +} + +/** + * Recalculate count of active streams for parent schedulers, when + * new stream is removed from the priority tree. If parent scheduler + * is deactivated in this function, remove appropriate parent stream + * from the tree of active streams. + */ +static void +tfw_h2_stream_sched_propagate_dec_active_cnt(TfwStreamSched *sched, + TfwStream *stream) +{ + TfwStreamSchedEntry *parent = stream->sched.parent; + bool stream_is_active = tfw_h2_stream_is_active(stream); + long int active_cnt = + stream->sched.active_cnt + (stream_is_active ? 1 : 0); + + if (!active_cnt) + return; + + while (true) { + parent->active_cnt -= active_cnt; + if (parent == &sched->root) + break; + + stream = container_of(parent, TfwStream, sched); + parent = stream->sched.parent; + /* + * Stream can have no parent if it is removed from + * the scheduler due to priority tree rebuilding. + */ + if (!parent) + break; + + if (tfw_h2_stream_is_active(stream) + || tfw_h2_stream_sched_is_active(&stream->sched)) + continue; + + BUG_ON(stream->sched_state != HTTP2_STREAM_SCHED_STATE_ACTIVE); + tfw_h2_stream_sched_insert_blocked(stream, stream->sched_node.key); + } +} + +/** + * Remove stream from the scheduler. Since this function is + * used when we delete stream also we should explicitly remove + * stream both from the tree. It is a caller responsibility + * to add stream again to the scheduler if it is necessary + * with appropriate deficite. + */ +void +tfw_h2_stream_sched_remove(TfwStreamSched *sched, TfwStream *stream) +{ + TfwStreamSchedEntry *parent = stream->sched.parent; + + tfw_h2_stream_sched_spin_lock_assert(sched); + + eb64_delete(&stream->sched_node); + stream->sched_state = HTTP2_STREAM_SCHED_STATE_UNKNOWN; + tfw_h2_stream_sched_propagate_dec_active_cnt(sched, stream); + stream->sched.parent = NULL; + parent->total_weight -= stream->weight; +} + +/** + * Find parent scheduler by id of the parent stream. If id == 0 or + * we can't find parent stream return root scheduler according to + * RFC 7540 5.3.1. + */ +TfwStreamSchedEntry * +tfw_h2_find_stream_dep(TfwStreamSched *sched, unsigned int id) +{ + tfw_h2_stream_sched_spin_lock_assert(sched); + + if (id) { + TfwStream *stream = tfw_h2_find_stream(sched, id); + if (stream) + return &stream->sched; + } + /* + * RFC 7540 5.3.1: + * A dependency on a stream that is not currently in the tree -- such + * as a stream in the "idle" state -- results in that stream being + * given a default priority. + */ + return &sched->root; +} + +static inline bool +tfw_h2_stream_sched_has_children(TfwStreamSchedEntry *entry) +{ + return !eb_is_empty(&entry->active) || !eb_is_empty(&entry->blocked); +} + +static inline void +tfw_h2_stream_sched_move_child(TfwStreamSched *sched, TfwStream *child, + TfwStreamSchedEntry *parent, u64 deficit) +{ + tfw_h2_stream_sched_remove(sched, child); + tfw_h2_sched_stream_enqueue(sched, child, parent, deficit); +} + +/** + * Add stream to the scheduler tree. @dep is a parent of new + * added stream. 
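+ *
+ * An illustration of the exclusive case (the stream letters are made
+ * up, cf. RFC 7540 5.3.1):
+ *
+ *      dep                  dep
+ *     /   \                  |
+ *    B     C      ==>      stream
+ *                          /    \
+ *                         B      C
+ *
+ * i.e. with @excl set the new stream becomes the sole child of @dep
+ * and adopts all of its former children; without @excl it is simply
+ * enqueued next to B and C.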
+ */ +void +tfw_h2_add_stream_dep(TfwStreamSched *sched, TfwStream *stream, + TfwStreamSchedEntry *dep, bool excl) +{ + u64 deficit, min_deficit; + bool stream_has_children; + + tfw_h2_stream_sched_spin_lock_assert(sched); + + if (!excl) { + deficit = tfw_h2_stream_sched_min_deficit(dep) + + tfw_h2_stream_default_deficit(stream); + tfw_h2_sched_stream_enqueue(sched, stream, dep, deficit); + return; + } + + /* + * Here we move children of dep scheduler to the current stream + * scheduler. If current stream scheduler has no children we move + * dep children as is (saving there deficit in the priority WFQ). + * Otherwise we calculate minimal deficit of the scheduler and use + * it as a base of new children deficit. + */ + stream_has_children = tfw_h2_stream_sched_has_children(&stream->sched); + min_deficit = !stream_has_children ? 0 : + tfw_h2_stream_sched_min_deficit(&stream->sched); + + /* + * RFC 7540 5.3.1: + * An exclusive flag allows for the insertion of a new level of + * dependencies. The exclusive flag causes the stream to become the + * sole dependency of its parent stream, causing other dependencies + * to become dependent on the exclusive stream. + */ + while (!eb_is_empty(&dep->blocked)) { + struct eb64_node *node = eb64_first(&dep->blocked); + TfwStream *child = eb64_entry(node, TfwStream, sched_node); + + deficit = !stream_has_children ? child->sched_node.key : + min_deficit + tfw_h2_stream_default_deficit(child); + tfw_h2_stream_sched_move_child(sched, child, &stream->sched, + deficit); + } + + while (!eb_is_empty(&dep->active)) { + struct eb64_node *node = eb64_first(&dep->active); + TfwStream *child = eb64_entry(node, TfwStream, sched_node); + + deficit = !stream_has_children ? child->sched_node.key : + min_deficit + tfw_h2_stream_default_deficit(child); + tfw_h2_stream_sched_move_child(sched, child, &stream->sched, + deficit); + } + + BUG_ON(tfw_h2_stream_sched_has_children(dep)); + /* Stream is the only one in dep scheduler, use default deficit. */ + tfw_h2_sched_stream_enqueue(sched, stream, dep, + tfw_h2_stream_default_deficit(stream)); +} + +/** + * Remove stream from the dependency tree. Move it's children to its + * parent scheduler according RFC 7540. + */ +void +tfw_h2_remove_stream_dep(TfwStreamSched *sched, TfwStream *stream) +{ + TfwStreamSchedEntry *parent = stream->sched.parent; + size_t total_weight = stream->sched.total_weight; + unsigned short new_weight; + bool parent_has_children; + u64 deficit; + + tfw_h2_stream_sched_spin_lock_assert(sched); + + /* Remove stream from the parent scheduler. */ + tfw_h2_stream_sched_remove(sched, stream); + + /* + * Here we move children of the removed stream to the parent + * scheduler. If parent scheduler has no children we move + * current removed stream children as is (saving there deficit + * in the priority WFQ). Otherwise we put them in the parent + * scheduler with current removed stream deficit. We can't + * save children deficit, because it has no matter for the + * parent scheduler WFQ. + */ + parent_has_children = tfw_h2_stream_sched_has_children(parent); + + /* + * According to RFC 7540 section 5.3.4: + * If the parent stream is removed from the tree, the weight of the + * parent stream is divided between it's children according to there + * weights. 
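+	 *
+	 * A numeric illustration of the recalculation below (the
+	 * numbers are made up): removing a stream of weight 16 whose
+	 * children have weights 8 and 24 (total_weight == 32) leaves
+	 * the children with weights 8 * 16 / 32 = 4 and
+	 * 24 * 16 / 32 = 12; a result that truncates to 0 is clamped
+	 * to the minimum weight of 1.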
+ */ + while (!eb_is_empty(&stream->sched.blocked)) { + struct eb64_node *node = eb64_first(&stream->sched.blocked); + TfwStream *child = eb64_entry(node, TfwStream, sched_node); + + /* + * Remove children of the removed stream, recalculate there + * weights and add them to the scheduler of the parent of + * the removed stream. + */ + new_weight = child->weight * + stream->weight / total_weight; + child->weight = new_weight > 0 ? new_weight : 1; + deficit = !parent_has_children ? + child->sched_node.key : stream->sched_node.key; + tfw_h2_stream_sched_move_child(sched, child, parent, deficit); + } + + while (!eb_is_empty(&stream->sched.active)) { + struct eb64_node *node = eb64_first(&stream->sched.active); + TfwStream *child = eb64_entry(node, TfwStream, sched_node); + + /* + * Remove children of the removed stream, recalculate there + * weights and add them to the scheduler of the parent of + * the removed stream. + */ + new_weight = child->weight * + stream->weight / total_weight; + child->weight = new_weight > 0 ? new_weight : 1; + deficit = !parent_has_children ? + child->sched_node.key : stream->sched_node.key; + tfw_h2_stream_sched_move_child(sched, child, parent, deficit); + } + + BUG_ON(stream->sched.active_cnt); +} + +/** + * Check if the stream is now depends from it's child. + */ +static bool +tfw_h2_is_stream_depend_on_child(TfwStreamSched *sched, TfwStream *stream, + TfwStreamSchedEntry *new_parent) +{ + TfwStreamSchedEntry *parent = new_parent->parent; + TfwStream *next; + + while (parent && parent != &sched->root) { + next = container_of(parent, TfwStream, sched); + if (next == stream) + return true; + parent = parent->parent; + } + + return false; +} + +void +tfw_h2_change_stream_dep(TfwStreamSched *sched, unsigned int stream_id, + unsigned int new_dep, unsigned short new_weight, + bool excl) +{ + TfwStreamSchedEntry *old_parent, *new_parent; + TfwStream *stream, *np; + bool is_stream_depends_on_child; + + tfw_h2_stream_sched_spin_lock_assert(sched); + + stream = tfw_h2_find_stream(sched, stream_id); + BUG_ON(!stream); + old_parent = stream->sched.parent; + BUG_ON(!old_parent); + + new_parent = tfw_h2_find_stream_dep(sched, new_dep); + + is_stream_depends_on_child = + tfw_h2_is_stream_depend_on_child(sched, stream, new_parent); + + if (!is_stream_depends_on_child) { + /* + * If stream is not dependent from it's child, just remove + * this stream change it's weight and add stream to the + * new parent. + * The order of calling next functions is important: + * 1. First we should remove current stream from the + * dependency tree (with recalculation of total + * weight of parent schedulers). + * 2. Change stream weight. + * 3. Insert stream in the dependency tree as a + * child of the new parent. + */ + tfw_h2_stream_sched_remove(sched, stream); + stream->weight = new_weight; + tfw_h2_add_stream_dep(sched, stream, new_parent, excl); + } else { + /* + * If stream is dependent from it's child, remove this + * child from the dependency tree, put this child to the + * location of the current stream and then add current + * stream as a child of the new parent (which was a child + * of current stream). + * (See RFC 7540 section 5.3.3). + * The order of calling next functions is important: + * 1. Remove new parent, which is a child of current stream. + * (with recalculation of weight and active count of current + * stream scheduler). + * 2. Remove current stream from the dependency tree. + * 3. Change stream weight and insert new parent and stream + * according RFC 7540. 
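+		 *
+		 * A small example of the resulting shape (stream names
+		 * are illustrative, non-exclusive case), where A is
+		 * re-prioritized to depend on its own child B:
+		 *
+		 *     P                P
+		 *     |                |
+		 *     A      ==>       B
+		 *     |                |
+		 *     B                A
+		 *
+		 * B first takes A's former place under P, then A is
+		 * enqueued as a child of B.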
+ */ + BUG_ON(new_parent == &sched->root); + np = container_of(new_parent, TfwStream, sched); + + tfw_h2_stream_sched_remove(sched, np); + tfw_h2_stream_sched_remove(sched, stream); + stream->weight = new_weight; + tfw_h2_add_stream_dep(sched, np, old_parent, false); + tfw_h2_add_stream_dep(sched, stream, new_parent, excl); + } + +} + +void +tfw_h2_sched_stream_enqueue(TfwStreamSched *sched, TfwStream *stream, + TfwStreamSchedEntry *parent, u64 deficit) +{ + tfw_h2_stream_sched_spin_lock_assert(sched); + + parent->total_weight += stream->weight; + stream->sched.parent = parent; + + /* + * This function should be called only for new created streams or + * streams which were previously removed from the scheduler. + */ + BUG_ON(stream->sched_node.node.leaf_p); + + if (tfw_h2_stream_is_active(stream) + || tfw_h2_stream_sched_is_active(&stream->sched)) + tfw_h2_stream_sched_insert_active(stream, deficit); + else + tfw_h2_stream_sched_insert_blocked(stream, deficit); + + tfw_h2_stream_sched_propagate_add_active_cnt(sched, stream); +} + +TfwStream * +tfw_h2_sched_stream_dequeue(TfwStreamSched *sched, TfwStreamSchedEntry **parent) +{ + TfwStreamSchedEntry *entry = &sched->root; + struct eb64_node *node = eb64_first(&entry->active); + u64 deficit; + + while (node) { + TfwStream *stream = eb64_entry(node, TfwStream, sched_node); + + if (tfw_h2_stream_is_active(stream)) { + *parent = entry; + tfw_h2_stream_sched_remove(sched, stream); + return stream; + } else if (tfw_h2_stream_sched_is_active(&stream->sched)) { + /* + * This stream is blocked, but have active children, try + * to use one of them. + */ + *parent = stream->sched.parent; + tfw_h2_stream_sched_remove(sched, stream); + deficit = tfw_h2_stream_recalc_deficit(stream); + tfw_h2_sched_stream_enqueue(sched, stream, *parent, + deficit); + entry = &stream->sched; + node = eb64_first(&entry->active); + } else { + /* + * Since node is in active tree it should be active or + * has active children. + */ + BUG(); + } + } + + return NULL; +} + +void +tfw_h2_sched_activate_stream(TfwStreamSched *sched, TfwStream *stream) +{ + TfwStreamSchedEntry *parent = stream->sched.parent; + + tfw_h2_stream_sched_spin_lock_assert(sched); + BUG_ON(!tfw_h2_stream_is_active(stream)); + BUG_ON(!parent); + + if (!tfw_h2_stream_sched_is_active(&stream->sched)) + tfw_h2_stream_sched_insert_active(stream, stream->sched_node.key); + + while (true) { + bool need_activate = !tfw_h2_stream_sched_is_active(parent); + parent->active_cnt += 1; + if (parent == &sched->root) + break; + + stream = container_of(parent, TfwStream, sched); + parent = stream->sched.parent; + BUG_ON(!parent); + + if (need_activate && !tfw_h2_stream_is_active(stream)) + tfw_h2_stream_sched_insert_active(stream, stream->sched_node.key); + } +} diff --git a/fw/http_stream_sched.h b/fw/http_stream_sched.h new file mode 100644 index 000000000..0767e5a11 --- /dev/null +++ b/fw/http_stream_sched.h @@ -0,0 +1,95 @@ +/** + * Tempesta FW + * + * Copyright (C) 2024 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#ifndef __HTTP_STREAM_SCHED__ +#define __HTTP_STREAM_SCHED__ + +#include +#include + +#include "lib/eb64tree.h" +#include "http_types.h" + +/** + * @total_weight - total weight of the streams for this scheduler; + * @active_cnt - count of active child streams for this scheduler; + * @parent - parent scheduler; + * @active - root of the active streams scheduler ebtree; + * @blocked - root of the blocked streams scheduler ebtree; + */ +typedef struct tfw_stream_sched_entry_t { + u64 total_weight; + long int active_cnt; + struct tfw_stream_sched_entry_t *parent; + struct eb_root active; + struct eb_root blocked; +} TfwStreamSchedEntry; + +/** + * Scheduler for stream's processing distribution based on dependency/priority + * values. + * + * @streams - root red-black tree entry for per-connection streams storage; + * @root - root scheduler of per-connection priority tree; + * @blocked_streams - count of blocked streams; + */ +typedef struct tfw_stream_sched_t { + struct rb_root streams; + TfwStreamSchedEntry root; + long int blocked_streams; +} TfwStreamSched; + +TfwStreamSchedEntry *tfw_h2_find_stream_dep(TfwStreamSched *sched, + unsigned int id); +void tfw_h2_add_stream_dep(TfwStreamSched *sched, TfwStream *stream, + TfwStreamSchedEntry *dep, bool excl); +void tfw_h2_remove_stream_dep(TfwStreamSched *sched, TfwStream *stream); +void tfw_h2_change_stream_dep(TfwStreamSched *sched, unsigned int stream_id, + unsigned int new_dep, unsigned short new_weight, + bool excl); + +void tfw_h2_stream_sched_remove(TfwStreamSched *sched, TfwStream *stream); +void tfw_h2_sched_stream_enqueue(TfwStreamSched *sched, TfwStream *stream, + TfwStreamSchedEntry *parent, u64 deficit); +TfwStream *tfw_h2_sched_stream_dequeue(TfwStreamSched *sched, + TfwStreamSchedEntry **parent); +void tfw_h2_sched_activate_stream(TfwStreamSched *sched, TfwStream *stream); + +static inline bool +tfw_h2_stream_sched_is_active(TfwStreamSchedEntry *sched) +{ + return sched->active_cnt; +} + +static inline void +tfw_h2_init_stream_sched_entry(TfwStreamSchedEntry *entry) +{ + entry->total_weight = entry->active_cnt = 0; + entry->parent = NULL; + entry->blocked = entry->active = EB_ROOT; +} + +static inline void +tfw_h2_init_stream_sched(TfwStreamSched *sched) +{ + sched->streams = RB_ROOT; + tfw_h2_init_stream_sched_entry(&sched->root); +} + +#endif /* __HTTP_STREAM_SCHED__ */ diff --git a/fw/http_types.h b/fw/http_types.h index d70f2337c..f767592e7 100644 --- a/fw/http_types.h +++ b/fw/http_types.h @@ -79,8 +79,6 @@ enum { TFW_HTTP_B_H2, /* Message has all mandatory pseudo-headers (applicable for HTTP/2 mode only) */ TFW_HTTP_B_H2_HDRS_FULL, - /* Message in HTTP/2 transformation (applicable for HTTP/2 mode only). */ - TFW_HTTP_B_H2_TRANS_ENTERED, /* Request flags. */ TFW_HTTP_FLAGS_REQ, @@ -126,6 +124,11 @@ enum { TFW_HTTP_B_HDR_ETAG_HAS_NO_QOUTES, /* Request URI is absolute (HTTP/1.x only) */ TFW_HTTP_B_ABSOLUTE_URI, + /* + * This is the error response, connection + * will be closed after sending it. + */ + TFW_HTTP_B_CLOSE_ERROR_RESPONSE, _TFW_HTTP_FLAGS_NUM }; diff --git a/fw/main.c b/fw/main.c index ce6ec5010..54e51324c 100644 --- a/fw/main.c +++ b/fw/main.c @@ -2,7 +2,7 @@ * Tempesta FW * * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2023 Tempesta Technologies, Inc. 
+ * Copyright (C) 2015-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -346,13 +346,16 @@ tfw_ctlfn_state_io(struct ctl_table *ctl, int is_write, if (is_write) { char buf[T_SYSCTL_STBUF_LEN]; + char start[T_SYSCTL_STBUF_LEN] = "start"; + char stop[T_SYSCTL_STBUF_LEN] = "stop"; + tmp.data = buf; if ((r = proc_dostring(&tmp, is_write, user_buf, lenp, ppos))) goto out; r = tfw_ctlfn_state_change(buf); strscpy(new_state_buf, - tfw_runstate_is_started() ? "start" : "stop", + tfw_runstate_is_started() ? start : stop, T_SYSCTL_STBUF_LEN); } else { tmp.data = new_state_buf; diff --git a/fw/msg.c b/fw/msg.c index 061d51163..d23eacad0 100644 --- a/fw/msg.c +++ b/fw/msg.c @@ -97,42 +97,6 @@ int tfw_http_iter_set_at(TfwMsgIter *it, char *off) return -E2BIG; } -char * -tfw_http_iter_set_at_skb(TfwMsgIter *it, struct sk_buff *skb, - unsigned long off) -{ - char *begin, *end; - unsigned long d; - unsigned char i; - - if (skb_headlen(it->skb)) { - begin = it->skb->data; - end = begin + skb_headlen(it->skb); - - if (begin + off <= end) { - it->frag = -1; - return begin + off; - } - off -= skb_headlen(it->skb); - } - - for (i = 0; i < skb_shinfo(it->skb)->nr_frags; i++) { - skb_frag_t *f = &skb_shinfo(it->skb)->frags[i]; - - begin = skb_frag_address(f); - end = begin + skb_frag_size(f); - d = end - begin; - if (off >= d) { - off -= d; - continue; - } - it->frag = i; - return begin + off; - } - - return NULL; -} - /** * Move message iterator from @data pointer by @sz symbols right. * @sz must be less than remaining message size, otherwise an error will be diff --git a/fw/msg.h b/fw/msg.h index da008675a..2cd555f60 100644 --- a/fw/msg.h +++ b/fw/msg.h @@ -25,7 +25,7 @@ #include -#include "sync_socket.h" +#include "str.h" /** * @seq_list - member in the ordered queue of messages; @@ -100,24 +100,8 @@ int tfw_msg_iter_setup(TfwMsgIter *it, struct sk_buff **skb_head, size_t data_len, unsigned int flags); int tfw_msg_iter_append_skb(TfwMsgIter *it); int tfw_http_iter_set_at(TfwMsgIter *it, char *off); -char *tfw_http_iter_set_at_skb(TfwMsgIter *it, struct sk_buff *skb, - unsigned long off); int tfw_msg_iter_move(TfwMsgIter *it, unsigned char **data, unsigned long sz); -static inline void -tfw_msg_iter_set_skb_priv(TfwMsgIter *it, unsigned int priv, - unsigned short flags) -{ - struct sk_buff *skb = it->skb; - do { - if (flags) - skb_set_tfw_flags(skb, flags); - if (priv) - skb_set_tfw_cb(skb, priv); - skb = skb->next; - } while (skb != it->skb_head); -} - static inline int tfw_msg_iter_next_data_frag(TfwMsgIter *it) { diff --git a/fw/sock.c b/fw/sock.c index e8240f932..61ffd344a 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -2,7 +2,7 @@ * Synchronous Socket API. * * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2023 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. 
* * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -40,12 +40,6 @@ #include "work_queue.h" #include "http_limits.h" -typedef enum { - SS_SEND, - SS_CLOSE, - SS_SHUTDOWN, -} SsAction; - typedef struct { struct sock *sk; struct sk_buff *skb_head; @@ -197,7 +191,7 @@ static void ss_conn_drop_guard_exit(struct sock *sk) { kernel_fpu_begin(); - SS_CONN_TYPE(sk) &= ~(Conn_Closing | Conn_Shutdown); + SS_CONN_TYPE(sk) &= ~Conn_Closing; SS_CALL(connection_drop, sk); if (sk->sk_security) tfw_classify_conn_close(sk); @@ -372,30 +366,48 @@ ss_forced_mem_schedule(struct sock *sk, int size) sk_memory_allocated_add(sk, amt); } -/** - * @skb_head can be invalid after the function call, don't try to use it. - */ -static void -ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags) +void +ss_skb_tcp_entail(struct sock *sk, struct sk_buff *skb, unsigned int mark, + unsigned char tls_type) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb, *head = *skb_head; - int size, mss = tcp_send_mss(sk, &size, MSG_DONTWAIT); - unsigned int mark = (*skb_head)->mark; - T_DBG3("[%d]: %s: sk=%pK queue_empty=%d send_head=%pK" - " sk_state=%d mss=%d size=%d\n", - smp_processor_id(), __func__, - sk, tcp_write_queue_empty(sk), tcp_send_head(sk), - sk->sk_state, mss, size); + ss_skb_on_tcp_entail(sk->sk_user_data, skb); + ss_skb_init_for_xmit(skb); + skb->mark = mark; + if (tls_type) + skb_set_tfw_tls_type(skb, tls_type); + ss_forced_mem_schedule(sk, skb->truesize); + tcp_skb_entail(sk, skb); + tp->write_seq += skb->len; + TCP_SKB_CB(skb)->end_seq += skb->len; + + T_DBG3("[%d]: %s: entail sk=%pK skb=%pK data_len=%u len=%u" + " truesize=%u mark=%u tls_type=%x\n", + smp_processor_id(), __func__, sk, skb, skb->data_len, + skb->len, skb->truesize, skb->mark, + skb_tfw_tls_type(skb)); +} - /* If the socket is inactive, there's no recourse. Drop the data. */ - if (unlikely(!ss_sock_active(sk))) { - ss_skb_queue_purge(skb_head); - return; - } +void +ss_skb_tcp_entail_list(struct sock *sk, struct sk_buff **skb_head) +{ + struct sk_buff *skb; + unsigned char tls_type = 0; + unsigned int mark = 0; while ((skb = ss_skb_dequeue(skb_head))) { + /* + * @skb_head can be the head of several different skb + * lists. We set tls type for the head of each new + * skb list and we should entail each skb with mark + * and tls_type of the head of the list to which it + * belongs. + */ + if (TFW_SKB_CB(skb)->is_head) { + tls_type = skb_tfw_tls_type(skb); + mark = skb->mark; + } /* * Zero-sized SKBs may appear when the message headers (or any * other contents) are modified or deleted by Tempesta. Drop @@ -408,28 +420,42 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags) kfree_skb(skb); continue; } + ss_skb_tcp_entail(sk, skb, mark, tls_type); + } +} - ss_skb_init_for_xmit(skb); - if (flags & SS_F_ENCRYPT) { - skb_set_tfw_tls_type(skb, SS_SKB_F2TYPE(flags)); - if (skb == head) - skb_set_tfw_flags(skb, SS_F_HTTP2_FRAME_START); - } - /* Propagate mark of message head skb.*/ - skb->mark = mark; +/** + * @skb_head can be invalid after the function call, don't try to use it. + */ +static void +ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags) +{ + int size, mss = tcp_send_mss(sk, &size, MSG_DONTWAIT); + void *conn = sk->sk_user_data; + unsigned char tls_type = flags & SS_F_ENCRYPT ? 
+ SS_SKB_F2TYPE(flags) : 0; + + T_DBG3("[%d]: %s: sk=%pK queue_empty=%d send_head=%pK" + " sk_state=%d mss=%d size=%d\n", + smp_processor_id(), __func__, + sk, tcp_write_queue_empty(sk), tcp_send_head(sk), + sk->sk_state, mss, size); + + /* If the socket is inactive, there's no recourse. Drop the data. */ + if (unlikely(!conn || !ss_sock_active(sk))) + goto cleanup; - T_DBG3("[%d]: %s: entail sk=%pK skb=%pK data_len=%u len=%u" - " truesize=%u mark=%u tls_type=%x\n", - smp_processor_id(), __func__, sk, - skb, skb->data_len, skb->len, skb->truesize, skb->mark, - skb_tfw_tls_type(skb)); + ss_skb_setup_head_of_list(*skb_head, (*skb_head)->mark, tls_type); - ss_forced_mem_schedule(sk, skb->truesize); - tcp_skb_entail(sk, skb); + if (ss_skb_on_send(conn, skb_head)) + goto cleanup; - tp->write_seq += skb->len; - TCP_SKB_CB(skb)->end_seq += skb->len; - } + /* + * If skbs were pushed to scheuler tree, @skb_head is + * empty and `ss_skb_tcp_entail_list` doesn't make + * any job. + */ + ss_skb_tcp_entail_list(sk, skb_head); T_DBG3("[%d]: %s: sk=%p send_head=%p sk_state=%d flags=%x\n", smp_processor_id(), __func__, @@ -442,7 +468,34 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags) if (flags & SS_F_CONN_CLOSE) return; - tcp_push(sk, MSG_DONTWAIT, mss, TCP_NAGLE_OFF|TCP_NAGLE_PUSH, size); + /* + * We set SOCK_TEMPESTA_HAS_DATA when we add some skb in our + * scheduler tree. + * So there are two cases here: + * - packets out is equal to zero and sock flag is set, + * this means that we should call `tcp_push_pending_frames`. + * In this function our scheduler choose the most priority + * stream, make frames for this stream and push them to the + * socket write queue. + * - socket flag is not set, this means that we push skb directly + * to the socket write queue so we call `tcp_push` and don't + * run scheduler. + * If packets_out is not equal to zero `tcp_push_pending_frames` + * will be called later from `tcp_data_snd_check` when we receive + * ack from the peer. + */ + if (sock_flag(sk, SOCK_TEMPESTA_HAS_DATA)) { + tcp_push_pending_frames(sk); + } else { + tcp_push(sk, MSG_DONTWAIT, mss, TCP_NAGLE_OFF | TCP_NAGLE_PUSH, + size); + } + + return; + +cleanup: + ss_skb_destroy_opaque_data(*skb_head); + ss_skb_queue_purge(skb_head); } /** @@ -606,6 +659,9 @@ ss_do_close(struct sock *sk, int flags) tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, sk->sk_allocation); } else if (tcp_close_state(sk)) { + int size, mss = tcp_send_mss(sk, &size, MSG_DONTWAIT); + if (sk->sk_fill_write_queue) + sk->sk_fill_write_queue(sk, mss, SS_CLOSE); tcp_send_fin(sk); } @@ -791,6 +847,7 @@ do { \ * own flags, thus clear it. */ skb->dev = NULL; + memset(skb->cb, 0, sizeof(skb->cb)); if (unlikely(offset >= skb->len)) { offset -= skb->len; @@ -1444,7 +1501,16 @@ __sk_close_locked(struct sock *sk, int flags) static inline void ss_do_shutdown(struct sock *sk) { - tcp_shutdown(sk, SEND_SHUTDOWN); + int size, mss = tcp_send_mss(sk, &size, MSG_DONTWAIT); + /* + * We send `tcp_shutdown` from `sk_fill_write_queue` if + * there is no pending data in our sceduler and SS_SHUTDOWN + * is passed as ss_action. 
+ */ + if (sk->sk_fill_write_queue) + sk->sk_fill_write_queue(sk, mss, SS_SHUTDOWN); + else + tcp_shutdown(sk, SEND_SHUTDOWN); SS_CONN_TYPE(sk) |= Conn_Shutdown; } @@ -1572,6 +1638,9 @@ ss_tx_action(void) } dead_sock: sock_put(sk); /* paired with push() calls */ + if (sw.skb_head) + ss_skb_destroy_opaque_data(sw.skb_head); + while ((skb = ss_skb_dequeue(&sw.skb_head))) kfree_skb(skb); } @@ -1806,6 +1875,27 @@ ss_active(void) return READ_ONCE(__ss_active); } +static inline int __init +tfw_sync_socket_wq_init(int cpu) +{ + TfwRBQueue *wq = &per_cpu(si_wq, cpu); + int r; + + r = tfw_wq_init(wq, max_t(unsigned int, TFW_DFLT_QSZ, __wq_size), + cpu_to_node(cpu)); + if (unlikely(r)) + return r; + init_irq_work(&per_cpu(ipi_work, cpu), ss_ipi); + return 0; +} + +static inline void +tfw_sync_socket_wq_cleanup(int cpu) +{ + irq_work_sync(&per_cpu(ipi_work, cpu)); + tfw_wq_destroy(&per_cpu(si_wq, cpu)); +} + int __init tfw_sync_socket_init(void) { @@ -1819,17 +1909,12 @@ tfw_sync_socket_init(void) __wq_size = ss_estimate_pcpu_wq_size(); for_each_online_cpu(cpu) { SsCloseBacklog *cb = &per_cpu(close_backlog, cpu); - TfwRBQueue *wq = &per_cpu(si_wq, cpu); - r = tfw_wq_init(wq, max_t(unsigned int, TFW_DFLT_QSZ, __wq_size), - cpu_to_node(cpu)); - if (r) { - T_ERR_NL("%s: Can't initialize softIRQ RX/TX work queue for CPU #%d\n", - __func__, cpu); - kmem_cache_destroy(ss_cbacklog_cache); - return r; + if (unlikely((r = tfw_sync_socket_wq_init(cpu)))) { + T_ERR_NL("%s: Can't initialize softIRQ RX/TX work" + " queue for CPU #%d\n", __func__, cpu); + goto cleanup; } - init_irq_work(&per_cpu(ipi_work, cpu), ss_ipi); INIT_LIST_HEAD(&cb->head); spin_lock_init(&cb->lock); @@ -1838,6 +1923,12 @@ tfw_sync_socket_init(void) tempesta_set_tx_action(ss_tx_action); return 0; + +cleanup: + for_each_online_cpu(cpu) + tfw_sync_socket_wq_cleanup(cpu); + kmem_cache_destroy(ss_cbacklog_cache); + return r; } void @@ -1847,8 +1938,7 @@ tfw_sync_socket_exit(void) tempesta_del_tx_action(); for_each_online_cpu(cpu) { - irq_work_sync(&per_cpu(ipi_work, cpu)); - tfw_wq_destroy(&per_cpu(si_wq, cpu)); + tfw_sync_socket_wq_cleanup(cpu); ss_backlog_validate_cleanup(cpu); } kmem_cache_destroy(ss_cbacklog_cache); diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index 386835104..d13d520a7 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -34,6 +34,7 @@ #include "server.h" #include "sync_socket.h" #include "tls.h" +#include "tcp.h" /* * ------------------------------------------------------------------------ @@ -174,266 +175,14 @@ tfw_cli_conn_send(TfwCliConn *cli_conn, TfwMsg *msg) return r; } -/** - * First `xmit` callback, which is used to add headers for HTTP2 - * HEADERS and DATA frames. Also used to add hpack dynamic table - * size at the beginning of the first header block according to - * RFC 7541. Implemented in separate function, because we use - * `tso_fragment` with new limit to split skb before passing it - * to the second `xmit` callback. 
- */ -static int -tfw_h2_sk_prepare_xmit(struct sock *sk, struct sk_buff *skb, - unsigned int mss_now, unsigned int *limit, - unsigned int *nskbs) -{ - TfwConn *conn = sk->sk_user_data; - unsigned short flags = skb_tfw_flags(skb); - unsigned int skb_priv = skb_tfw_cb(skb); - unsigned int truesize = 0, tmp_truesize = 0; - bool headers_was_done = false; - TfwH2Ctx *h2 = NULL; - TfwHPackETbl *tbl = NULL; - TfwStream *stream = NULL; - int r = 0; - -#define FRAME_HEADERS_SHOULD_BE_MADE(flags) \ - (flags & SS_F_HTTT2_FRAME_HEADERS) - -#define FRAME_DATA_SHOULD_BE_MADE(flags) \ - (flags & SS_F_HTTT2_FRAME_DATA) - -#define FRAME_HEADERS_OR_DATA_SHOULD_BE_MADE(flags) \ - (FRAME_HEADERS_SHOULD_BE_MADE(flags) \ - || FRAME_DATA_SHOULD_BE_MADE(flags)) - -#define FRAME_ALREADY_PREPARED(flags) \ - (flags & SS_F_HTTP2_FRAME_PREPARED) - -#define CHECK_STREAM_IS_PRESENT(stream) \ -do { \ - h2 = tfw_h2_context_unsafe(conn); \ - tbl = &h2->hpack.enc_tbl; \ - stream = tfw_h2_find_not_closed_stream(h2, skb_priv, false); \ - if (!stream) { \ - T_WARN("%s: stream with id (%u) already closed", \ - __func__, skb_priv); \ - /* \ - * TODO #1196: \ - * Don't purge tcp queue and don't close connection, \ - * because we can still send data for other streams. \ - */ \ - r = -EPIPE; \ - goto ret; \ - } \ -} while (0); - -#define TFW_H2_STREAM_SEND_PROCESS(h2, stream, type) \ - r = tfw_h2_stream_send_process(h2, stream, type); \ - if (unlikely(r != STREAM_FSM_RES_OK)) { \ - T_WARN("Failed to process stream %d", (int)r); \ - /* \ - * TODO #1196: \ - * drop all skbs for corresponding stream if \ - * r == STREAM_FSM_RES_TERM_STREAM. \ - */ \ - if (r == STREAM_FSM_RES_TERM_CONN) { \ - r = -EPIPE; \ - goto ret; \ - } \ - } - - BUG_ON(FRAME_ALREADY_PREPARED(flags)); - - /* - * If some error occurs between `tcp_tfw_sk_prepare_xmit` and - * `tcp_tfw_sk_write_xmit`, skb which was already processed will - * be passed to this function again. We should not process this - * skb, just update limit according to already processed bytes. - */ - if (FRAME_HEADERS_OR_DATA_SHOULD_BE_MADE(flags)) { - CHECK_STREAM_IS_PRESENT(stream); - tfw_h2_stream_xmit_reinit(&stream->xmit); - stream->xmit.nskbs = 1; - } else { - struct sk_buff *next = skb; - unsigned short flags; - - /* - * Here we deal with skbs which do not contain HEADERS or - * DATA frames. They should be encrypted in separate tls - * record. - */ - *nskbs = 1; - while (!tcp_skb_is_last(sk, next)) { - next = skb_queue_next(&sk->sk_write_queue, next); - flags = skb_tfw_flags(next); - - if (FRAME_HEADERS_OR_DATA_SHOULD_BE_MADE(flags)) - break; - (*nskbs)++; - } - } - - if (flags & SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING) { - h2 = tfw_h2_context_unsafe(conn); - tbl = &h2->hpack.enc_tbl; - - tfw_hpack_set_rbuf_size(tbl, skb_priv); - h2->rsettings.hdr_tbl_sz = tbl->window; - skb_clear_tfw_flag(skb, SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING); - } - - /* - * We should write new hpack dynamic table size at the - * beginning of the first header block. - */ - if (flags & SS_F_HTTP2_FRAME_START && - !(flags & SS_F_HTTT2_HPACK_TBL_SZ_ENCODED) - && FRAME_HEADERS_SHOULD_BE_MADE(flags)) - { - r = tfw_hpack_enc_tbl_write_sz(tbl, sk, skb, stream, - mss_now, &tmp_truesize); - if (unlikely(r)) { - T_WARN("%s: failed to encode new hpack dynamic " - "table size (%d)", __func__, r); - goto ret; - } - - flags |= (tmp_truesize ? 
SS_F_HTTT2_HPACK_TBL_SZ_ENCODED : 0); - skb_set_tfw_flags(skb, flags); - } - - truesize += tmp_truesize; - tmp_truesize = 0; - - if (FRAME_HEADERS_SHOULD_BE_MADE(flags)) { - if (*limit - stream->xmit.processed <= FRAME_HEADER_SIZE) { - r = -ENOMEM; - goto ret; - } - - r = tfw_h2_make_headers_frames(sk, skb, h2, stream, mss_now, - *limit - stream->xmit.processed, - &tmp_truesize); - if (unlikely(r)) { - T_WARN("%s: failed to make headers frames (%d)", - __func__, r); - goto ret; - } - - truesize += tmp_truesize; - tmp_truesize = 0; - headers_was_done = true; - - /* - * We clear this flag to prevent it's copying - * during skb splitting. - */ - if (!stream->xmit.h_len) { - skb_clear_tfw_flag(skb, SS_F_HTTT2_FRAME_HEADERS); - TFW_H2_STREAM_SEND_PROCESS(h2, stream, HTTP2_HEADERS); - } - } - - if (FRAME_DATA_SHOULD_BE_MADE(flags)) { - if (stream->rem_wnd <= 0 || h2->rem_wnd <= 0 - || *limit - stream->xmit.processed <= FRAME_HEADER_SIZE) { - if (headers_was_done) - goto update_limit; - r = -ENOMEM; - goto ret; - } - - r = tfw_h2_make_data_frames(sk, skb, h2, stream, mss_now, - *limit - stream->xmit.processed, - &tmp_truesize); - if (unlikely(r)) { - T_WARN("%s: failed to make data frames (%d)", - __func__, r); - if (r == -ENOMEM && headers_was_done) { - r = 0; - goto update_limit; - } - goto ret; - } - - truesize += tmp_truesize; - tmp_truesize = 0; - - /* - * We clear this flag to prevent it's copying - * during skb splitting. - */ - if (!stream->xmit.b_len) { - skb_clear_tfw_flag(skb, SS_F_HTTT2_FRAME_DATA); - TFW_H2_STREAM_SEND_PROCESS(h2, stream, HTTP2_DATA); - } - } - -update_limit: - if (FRAME_HEADERS_OR_DATA_SHOULD_BE_MADE(flags) - && stream && stream->xmit.nskbs == 1) - *limit = stream->xmit.processed; - - if (skb->len > *limit) { - unsigned short saved_flags = skb_tfw_flags(skb); - - /* - * Hacky way to clear flags of skb that will be created after - * splitting such skb must be with cleared flags, but - * current skb must be with already set flags. - */ - skb->tfw_cb.flags &= (unsigned short)(~TEMPESTA_SKB_FLAG_CLEAR_MASK); - r = tso_fragment(sk, skb, *limit, mss_now, - sk_gfp_mask(sk, GFP_ATOMIC)); - skb->tfw_cb.flags = saved_flags; - } - -ret: - /* Reinit stream xmit context. */ - if (stream) - *nskbs = !r ? stream->xmit.nskbs : 0; - - /* - * Since we add some data to skb, we should adjust the socket write - * memory both in case of success and in case of failure. - */ - if (unlikely(ss_add_overhead(sk, truesize))) { - T_WARN("%s: failed to add overhead to current TCP " - "socket control data.", __func__); - /* - * In case of previous error return it, - * otherwise return -ENOMEM. - */ - r = r ? 
r : -ENOMEM; - } - - if (unlikely(r) && r != -ENOMEM) { - if (stream) - tfw_h2_stream_add_closed(h2, stream); - } - - if (likely(!r)) - skb_set_tfw_flags(skb, SS_F_HTTP2_FRAME_PREPARED); - - return r; - -#undef TFW_H2_STREAM_SEND_PROCESS -#undef CHECK_STREAM_IS_PRESENT -#undef FRAME_ALREADY_PREPARED -#undef FRAME_HEADERS_OR_DATA_SHOULD_BE_MADE -#undef FRAME_DATA_SHOULD_BE_MADE -#undef FRAME_HEADERS_SHOULD_BE_MADE -} - static int -tfw_sk_prepare_xmit(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, - unsigned int *limit, unsigned int *nskbs) +tfw_sk_fill_write_queue(struct sock *sk, unsigned int mss_now, int ss_action) { TfwConn *conn = sk->sk_user_data; - bool h2_mode; - int r = 0; + TfwH2Ctx *h2; + bool data_is_available = false; + unsigned long snd_wnd; + int r; assert_spin_locked(&sk->sk_lock.slock); /* @@ -445,38 +194,28 @@ tfw_sk_prepare_xmit(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, */ BUG_ON(!conn); - *nskbs = UINT_MAX; - h2_mode = TFW_CONN_PROTO(conn) == TFW_FSM_H2; - if (h2_mode) - r = tfw_h2_sk_prepare_xmit(sk, skb, mss_now, limit, nskbs); - - return r; -} - -static int -tfw_sk_write_xmit(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, - unsigned int limit, unsigned int nskbs) -{ - TfwConn *conn = sk->sk_user_data; - unsigned short flags; - bool h2_mode; - int r = 0; - - assert_spin_locked(&sk->sk_lock.slock); - /* Same as for tfw_sk_prepare_xmit(). */ - BUG_ON(!conn); + /* + * This function can be called both for HTTP1 and HTTP2 connections. + * Moreover this function can be called when HTTP2 connection is + * shutdowned before TLS hadshake was finished. + */ + h2 = TFW_CONN_PROTO(conn) == TFW_FSM_H2 ? + tfw_h2_context_safe(conn) : NULL; + if (!h2) { + if (ss_action == SS_SHUTDOWN) + tcp_shutdown(sk, SEND_SHUTDOWN); + return 0; + } - h2_mode = TFW_CONN_PROTO(conn) == TFW_FSM_H2; - flags = skb_tfw_flags(skb); + snd_wnd = tfw_tcp_calc_snd_wnd(sk, mss_now); - r = tfw_tls_encrypt(sk, skb, mss_now, limit, nskbs); + r = tfw_h2_make_frames(sk, h2, snd_wnd, ss_action, &data_is_available); + if (unlikely(r < 0)) + return r; - if (h2_mode && r != -ENOMEM && (flags & SS_F_HTTT2_HPACK_TBL_SZ_ENCODED)) { - TfwH2Ctx *h2 = tfw_h2_context_unsafe(conn); - TfwHPackETbl *tbl = &h2->hpack.enc_tbl; + if (!data_is_available) + sock_reset_flag(sk, SOCK_TEMPESTA_HAS_DATA); - tfw_hpack_enc_tbl_write_sz_release(tbl, r); - } return r; } @@ -545,8 +284,8 @@ tfw_sock_clnt_new(struct sock *sk) * upcall beside GFSM and SS, but that's efficient and I didn't * find a simple and better solution. */ - sk->sk_prepare_xmit = tfw_sk_prepare_xmit; - sk->sk_write_xmit = tfw_sk_write_xmit; + sk->sk_write_xmit = tfw_tls_encrypt; + sk->sk_fill_write_queue = tfw_sk_fill_write_queue; } /* Activate keepalive timer. */ @@ -946,7 +685,7 @@ tfw_cfgop_keepalive_timeout(TfwCfgSpec *cs, TfwCfgEntry *ce) if (tfw_cli_cfg_ka_timeout < 0) { T_ERR_NL("Unable to parse 'keepalive_timeout' value: '%s'\n", - "Value less the zero"); + "Value less then zero"); return -EINVAL; } diff --git a/fw/ss_skb.c b/fw/ss_skb.c index 24a3c5f84..8f3ea3f81 100644 --- a/fw/ss_skb.c +++ b/fw/ss_skb.c @@ -231,7 +231,7 @@ __extend_pgfrags(struct sk_buff *skb_head, struct sk_buff *skb, int from, int n) /* No fragments to shift. */ if (!tail_frags) - return 0; + goto finish; /* * Move @n_excess number of page fragments to new SKB. 
We @@ -262,6 +262,8 @@ __extend_pgfrags(struct sk_buff *skb_head, struct sk_buff *skb, int from, int n) if (n_shift > 0) memmove(&si->frags[from + n], &si->frags[from], n_shift * sizeof(skb_frag_t)); + +finish: si->nr_frags += n - n_excess; return 0; @@ -1308,11 +1310,15 @@ ss_skb_init_for_xmit(struct sk_buff *skb) struct skb_shared_info *shinfo = skb_shinfo(skb); __u8 pfmemalloc = skb->pfmemalloc; - WARN_ON_ONCE(skb->next || skb->prev); WARN_ON_ONCE(skb->sk); skb_dst_drop(skb); INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); + /* + * Since we use skb->sb for our purpose we should + * zeroed it before pass skb to the kernel. + */ + memset(skb->cb, 0, sizeof(skb->cb)); if (!skb_transport_header_was_set(skb)) { /* Quick path for new skbs. */ @@ -1321,7 +1327,6 @@ ss_skb_init_for_xmit(struct sk_buff *skb) } skb->skb_mstamp_ns = 0; - bzero_fast(skb->cb, sizeof(skb->cb)); nf_reset_ct(skb); skb->mac_len = 0; skb->queue_mapping = 0; diff --git a/fw/ss_skb.h b/fw/ss_skb.h index bf2e3d3cf..98761d16b 100644 --- a/fw/ss_skb.h +++ b/fw/ss_skb.h @@ -23,6 +23,7 @@ #define __TFW_SS_SKB_H__ #include +#include #include "str.h" #include "lib/log.h" @@ -51,6 +52,77 @@ enum { SS_OK = T_OK, }; +typedef int (*on_send_cb_t)(void *conn, struct sk_buff **skb_head); +typedef void (*on_tcp_entail_t)(void *conn, struct sk_buff *skb_head); + +/* + * Tempesta FW sk_buff private data. + * @opaque_data - pointer to some private data (typically http response); + * @destructor - destructor of the opaque data, should be set if data is + * not NULL + * @on_send - callback to special handling this skb before sending; + * @on_tcp_entail - callback to special handling this skb before pushing + * to socket write queue; + * @stream_id - id of sender stream; + * @is_head - flag indicates that this is a head of skb list; + */ +struct tfw_skb_cb { + void *opaque_data; + void (*destructor)(void *opaque_data); + on_send_cb_t on_send; + on_tcp_entail_t on_tcp_entail; + unsigned int stream_id; + bool is_head; +}; + +#define TFW_SKB_CB(skb) ((struct tfw_skb_cb *)&((skb)->cb[0])) + +static inline void +ss_skb_setup_head_of_list(struct sk_buff *skb_head, unsigned int mark, + unsigned char tls_type) +{ + if (tls_type) + skb_set_tfw_tls_type(skb_head, tls_type); + skb_head->mark = mark; + TFW_SKB_CB(skb_head)->is_head = true; +} + +static inline void +ss_skb_destroy_opaque_data(struct sk_buff *skb_head) +{ + void *opaque_data = TFW_SKB_CB(skb_head)->opaque_data; + void (*destructor)(void *) = TFW_SKB_CB(skb_head)->destructor; + + BUILD_BUG_ON(sizeof(struct tfw_skb_cb) > + sizeof(((struct sk_buff *)(0))->cb)); + + if (opaque_data) { + BUG_ON(!destructor); + destructor(opaque_data); + } +} + +static inline int +ss_skb_on_send(void *conn, struct sk_buff **skb_head) +{ + on_send_cb_t on_send = TFW_SKB_CB(*skb_head)->on_send; + int r = 0; + + if (on_send) + r = on_send(conn, skb_head); + + return r; +} + +static inline void +ss_skb_on_tcp_entail(void *conn, struct sk_buff *skb_head) +{ + on_tcp_entail_t on_tcp_entail = TFW_SKB_CB(skb_head)->on_tcp_entail; + + if (on_tcp_entail) + on_tcp_entail(conn, skb_head); +} + typedef int ss_skb_actor_t(void *conn, unsigned char *data, unsigned int len, unsigned int *read); @@ -92,6 +164,25 @@ ss_skb_queue_append(struct sk_buff **skb_head, struct sk_buff *skb) tail->next = skb; } +static inline void +ss_skb_queue_splice(struct sk_buff **skb_head, struct sk_buff **skb) +{ + struct sk_buff *tail; + + if ((!*skb_head)) { + swap(*skb_head, *skb); + return; + } + + tail = (*skb_head)->prev; + (*skb_head)->prev = 
(*skb)->prev; + (*skb)->prev->next = (*skb_head); + tail->next = *skb; + (*skb)->prev = tail; + + *skb = NULL; +} + static inline void ss_skb_remove(struct sk_buff *skb) { @@ -169,6 +260,19 @@ ss_skb_insert_before(struct sk_buff **skb_head, struct sk_buff *skb, *skb_head = nskb; } +static inline void +ss_skb_queue_head(struct sk_buff **skb_head, struct sk_buff *skb) +{ + /* The skb shouldn't be in any other queue. */ + WARN_ON_ONCE(skb->next || skb->prev); + if (!*skb_head) { + *skb_head = skb; + skb->prev = skb->next = skb; + return; + } + ss_skb_insert_before(skb_head, *skb_head, skb); +} + /** * Almost a copy of standard skb_dequeue() except it works with skb list * instead of sk_buff_head. Several crucial data include skb list and we don't @@ -294,6 +398,39 @@ ss_skb_move_frags(struct sk_buff *skb, struct sk_buff *nskb, int from, ss_skb_adjust_data_len(nskb, e_size); } +static inline char * +ss_skb_data_ptr_by_offset(struct sk_buff *skb, unsigned int off) +{ + char *begin, *end; + unsigned long d; + unsigned char i; + + if (skb_headlen(skb)) { + begin = skb->data; + end = begin + skb_headlen(skb); + + if (begin + off <= end) + return begin + off; + off -= skb_headlen(skb); + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + + begin = skb_frag_address(f); + end = begin + skb_frag_size(f); + d = end - begin; + + if (off > d) { + off -= d; + continue; + } + return begin + off; + } + + return NULL; +} + #define SS_SKB_MAX_DATA_LEN (SKB_MAX_HEADER + MAX_SKB_FRAGS * PAGE_SIZE) char *ss_skb_fmt_src_addr(const struct sk_buff *skb, char *out_buf); diff --git a/fw/sync_socket.h b/fw/sync_socket.h index ee23a70af..5539b00d3 100644 --- a/fw/sync_socket.h +++ b/fw/sync_socket.h @@ -2,7 +2,7 @@ * Synchronous Socket API. * * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2023 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -34,6 +34,12 @@ typedef struct ss_proto_t { int type; } SsProto; +typedef enum { + SS_SEND, + SS_CLOSE, + SS_SHUTDOWN, +} SsAction; + /* * Flag bits definition for SsProto.type field. * NOTE: There are also flags definition for this @@ -49,7 +55,7 @@ enum { * requests longer accepted (flag is intended * only for client connections). */ - Conn_Stop = 0x1 << __Flag_Bits, + Conn_Stop = (0x1 << __Flag_Bits), /* * Connection is in special state: we send FIN to * the client and wait until ACK to our FIN is come. @@ -60,7 +66,7 @@ enum { * Connection is in special state: it socket is DEAD * and wait until ACK to our FIN is come. */ - Conn_Closing = 0x3 << __Flag_Bits, + Conn_Closing = (0x3 << __Flag_Bits), }; typedef struct tfw_conn_t TfwConn; @@ -71,11 +77,11 @@ typedef struct ss_hooks { int (*connection_new)(struct sock *sk); /* - * Intentional socket closing when the socket is already closed (i.e. there - * could not be ingress data on it) and we can safely do some cleanup stuff - * or error on TCP connection (on Linux TCP socket layer) associated with - * the socket or at application (data processing) layer, i.e. unintentional - * connection closing. + * Intentional socket closing when the socket is already closed (i.e. 
+ * there could not be ingress data on it) and we can safely do some + * cleanup stuff or error on TCP connection (on Linux TCP socket layer) + * associated with the socket or at application (data processing) + * layer, i.e. unintentional connection closing. * We need the callback since socket closing always has a chance to run * asynchronously on another CPU and a caller doesn't know when it * completes. @@ -177,6 +183,9 @@ void ss_start(void); void ss_stop(void); bool ss_active(void); void ss_get_stat(SsStat *stat); +void ss_skb_tcp_entail(struct sock *sk, struct sk_buff *skb, unsigned int mark, + unsigned char tls_type); +void ss_skb_tcp_entail_list(struct sock *sk, struct sk_buff **skb_head); #define SS_CALL(f, ...) \ (sk->sk_user_data && ((SsProto *)(sk)->sk_user_data)->hooks->f \ diff --git a/fw/t/unit/helpers.c b/fw/t/unit/helpers.c index 3d5a1cf5b..91d7c8dbe 100644 --- a/fw/t/unit/helpers.c +++ b/fw/t/unit/helpers.c @@ -174,6 +174,15 @@ ss_stop(void) { } +void ss_skb_tcp_entail(struct sock *sk, struct sk_buff *skb, unsigned int mark, + unsigned char tls_type) +{ +} + +void ss_skb_tcp_entail_list(struct sock *sk, struct sk_buff **skb) +{ +} + void tfw_client_set_expires_time(unsigned int expires_time) { @@ -433,4 +442,10 @@ ttls_hs_done(TlsCtx *tls) return true; } +bool +ttls_xfrm_need_encrypt(TlsCtx *tls) +{ + return true; +} + unsigned int cache_default_ttl = 60; diff --git a/fw/t/unit/http2.c b/fw/t/unit/http2.c new file mode 100644 index 000000000..8188b36b2 --- /dev/null +++ b/fw/t/unit/http2.c @@ -0,0 +1,556 @@ +/** + * Tempesta FW + * + * Copyright (C) 2024 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#undef DEBUG +#if DBG_HTTP2 > 0 +#define DEBUG DBG_HTTP2 +#endif + +#include "connection.h" +#include "http.h" +#include "http2.h" +#include "http_frame.h" +#include "http_msg.h" + +#define TFW_MAX_CLOSED_STREAMS 5 + +/** + * Usually client firstly send SETTINGS frame to a server, so: + * - we don't have many streams to iterate over in this function + * (usually we have no streams at all). + * - typically there is only one SETTINGS_INITIAL_WINDOW_SIZE + * frame is sent from a client side. + */ +static void +tfw_h2_apply_wnd_sz_change(TfwH2Ctx *ctx, long int delta) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + TfwStream *stream, *next; + + /* + * Order is no matter, use default funtion from the Linux kernel. + * According to RFC 9113 6.9.2 + * When the value of SETTINGS_INITIAL_WINDOW_SIZE changes, a receiver + * MUST adjust the size of all stream flow-control windows that it + * maintains by the difference between the new value and the old value. + * A change to SETTINGS_INITIAL_WINDOW_SIZE can cause the available + * space in a flow-control window to become negative. 
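+	 *
+	 * Illustrative arithmetic: if the peer previously announced
+	 * SETTINGS_INITIAL_WINDOW_SIZE = 65535 and now announces 16384,
+	 * delta = 16384 - 65535 = -49151, so rem_wnd of every opened or
+	 * remotely half-closed stream drops by 49151 and may become
+	 * negative until WINDOW_UPDATE frames arrive.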
+ */ + rbtree_postorder_for_each_entry_safe(stream, next, + &ctx->sched.streams, node) { + TfwStreamState state = tfw_h2_get_stream_state(stream); + if (state == HTTP2_STREAM_OPENED || + state == HTTP2_STREAM_REM_HALF_CLOSED) { + stream->rem_wnd += delta; + tfw_h2_stream_try_unblock(&ctx->sched, stream); + if (stream->rem_wnd > 0) { + sock_set_flag(((TfwConn *)conn)->sk, + SOCK_TEMPESTA_HAS_DATA); + } + } + } +} + +static void +tfw_h2_apply_settings_entry(TfwH2Ctx *ctx, unsigned short id, + unsigned int val) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + TfwSettings *dest = &ctx->rsettings; + long int delta; + + switch (id) { + case HTTP2_SETTINGS_TABLE_SIZE: + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + dest->hdr_tbl_sz = min_t(unsigned int, + val, HPACK_ENC_TABLE_MAX_SIZE); + tfw_hpack_set_rbuf_size(&ctx->hpack.enc_tbl, dest->hdr_tbl_sz); + break; + + case HTTP2_SETTINGS_ENABLE_PUSH: + BUG_ON(val > 1); + dest->push = val; + break; + + case HTTP2_SETTINGS_MAX_STREAMS: + dest->max_streams = val; + break; + + case HTTP2_SETTINGS_INIT_WND_SIZE: + BUG_ON(val > MAX_WND_SIZE); + delta = (long int)val - (long int)dest->wnd_sz; + tfw_h2_apply_wnd_sz_change(ctx, delta); + dest->wnd_sz = val; + break; + + case HTTP2_SETTINGS_MAX_FRAME_SIZE: + BUG_ON(val < FRAME_DEF_LENGTH || val > FRAME_MAX_LENGTH); + dest->max_frame_sz = val; + break; + + case HTTP2_SETTINGS_MAX_HDR_LIST_SIZE: + dest->max_lhdr_sz = val; + break; + + default: + /* + * We should silently ignore unknown identifiers (see + * RFC 9113 section 6.5.2) + */ + break; + } +} + +int +tfw_h2_check_settings_entry(TfwH2Ctx *ctx, unsigned short id, unsigned int val) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + + switch (id) { + case HTTP2_SETTINGS_TABLE_SIZE: + break; + + case HTTP2_SETTINGS_ENABLE_PUSH: + if (val > 1) + return -EINVAL; + break; + + case HTTP2_SETTINGS_MAX_STREAMS: + break; + + case HTTP2_SETTINGS_INIT_WND_SIZE: + if (val > MAX_WND_SIZE) + return -EINVAL; + break; + + case HTTP2_SETTINGS_MAX_FRAME_SIZE: + if (val < FRAME_DEF_LENGTH || val > FRAME_MAX_LENGTH) + return -EINVAL; + break; + + case HTTP2_SETTINGS_MAX_HDR_LIST_SIZE: + break; + + default: + /* + * We should silently ignore unknown identifiers (see + * RFC 9113 section 6.5.2) + */ + break; + } + + return 0; +} + +void +tfw_h2_save_settings_entry(TfwH2Ctx *ctx, unsigned short id, unsigned int val) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + + if (id > 0 && id < _HTTP2_SETTINGS_MAX) { + ctx->new_settings[id - 1] = val; + __set_bit(id, ctx->settings_to_apply); + __set_bit(HTTP2_SETTINGS_NEED_TO_APPLY, + ctx->settings_to_apply); + } +} + +void +tfw_h2_apply_new_settings(TfwH2Ctx *ctx) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + unsigned int id; + + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + + for (id = HTTP2_SETTINGS_TABLE_SIZE; id < _HTTP2_SETTINGS_MAX; id++) { + if (test_bit(id, ctx->settings_to_apply)) { + unsigned int val = ctx->new_settings[id - 1]; + tfw_h2_apply_settings_entry(ctx, id, val); + } + } + clear_bit(HTTP2_SETTINGS_NEED_TO_APPLY, ctx->settings_to_apply); +} + +int +tfw_h2_init(void) +{ + return tfw_h2_stream_cache_create(); +} + +void +tfw_h2_cleanup(void) +{ + tfw_h2_stream_cache_destroy(); +} + +int +tfw_h2_context_init(TfwH2Ctx *ctx) +{ + TfwStreamQueue *closed_streams = &ctx->closed_streams; + TfwStreamQueue *idle_streams = 
&ctx->idle_streams; + TfwSettings *lset = &ctx->lsettings; + TfwSettings *rset = &ctx->rsettings; + + bzero_fast(ctx, sizeof(*ctx)); + + ctx->state = HTTP2_RECV_CLI_START_SEQ; + ctx->loc_wnd = DEF_WND_SIZE; + ctx->rem_wnd = DEF_WND_SIZE; + + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&closed_streams->list); + INIT_LIST_HEAD(&idle_streams->list); + + tfw_h2_init_stream_sched(&ctx->sched); + + lset->hdr_tbl_sz = rset->hdr_tbl_sz = HPACK_TABLE_DEF_SIZE; + lset->push = rset->push = 1; + lset->max_streams = tfw_cli_max_concurrent_streams; + rset->max_streams = 0xffffffff; + lset->max_frame_sz = rset->max_frame_sz = FRAME_DEF_LENGTH; + lset->max_lhdr_sz = max_header_list_size ? + max_header_list_size : UINT_MAX; + rset->max_lhdr_sz = UINT_MAX; + + lset->wnd_sz = DEF_WND_SIZE; + rset->wnd_sz = DEF_WND_SIZE; + + return tfw_hpack_init(&ctx->hpack, HPACK_TABLE_DEF_SIZE); +} +ALLOW_ERROR_INJECTION(tfw_h2_context_init, ERRNO); + +void +tfw_h2_context_clear(TfwH2Ctx *ctx) +{ + WARN_ON_ONCE(ctx->streams_num); + /* + * Free POSTPONED SKBs. This is necessary when h2 context has + * postponed frames and connection closing initiated. + */ + ss_skb_queue_purge(&ctx->skb_head); + tfw_hpack_clean(&ctx->hpack); +} + +void +tfw_h2_conn_terminate_close(TfwH2Ctx *ctx, TfwH2Err err_code, bool close, + bool attack) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + + if (tfw_h2_send_goaway(ctx, err_code, attack) && close) { + if (attack) + tfw_connection_close((TfwConn *)conn, true); + else + tfw_connection_shutdown((TfwConn *)conn, true); + } +} + +/** + * According to RFC 9113 section 5.1.1: + * The first use of a new stream identifier implicitly closes all + * streams in the "idle" state that might have been initiated by that + * peer with a lower-valued stream identifier. + */ +void +tfw_h2_remove_idle_streams(TfwH2Ctx *ctx, unsigned int id) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + TfwStream *stream, *tmp; + + /* + * We add and remove streams from idle queue under + * socket lock. + */ + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + + list_for_each_entry_safe_reverse(stream, tmp, &ctx->idle_streams.list, + hcl_node) + { + if (id <= stream->id) + break; + + tfw_h2_stream_del_from_queue_nolock(stream); + tfw_h2_set_stream_state(stream, HTTP2_STREAM_CLOSED); + tfw_h2_stream_add_closed(ctx, stream); + } +} + +void +tfw_h2_conn_streams_cleanup(TfwH2Ctx *ctx) +{ + TfwStream *cur, *next; + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + TfwStreamSched *sched = &ctx->sched; + + WARN_ON_ONCE(((TfwConn *)conn)->stream.msg); + + T_DBG3("%s: ctx [%p] conn %p sched %p\n", __func__, ctx, conn, sched); + + tfw_h2_remove_idle_streams(ctx, UINT_MAX); + + rbtree_postorder_for_each_entry_safe(cur, next, &sched->streams, node) { + tfw_h2_stream_purge_all_and_free_response(cur); + tfw_h2_stream_unlink_lock(ctx, cur); + + /* The streams tree is about to be destroyed and + * we don't want to trigger rebalancing. + * No further actions regarding streams dependencies/prio + * is required at this stage. + */ + tfw_h2_delete_stream(cur); + --ctx->streams_num; + } + sched->streams = RB_ROOT; +} + +void +tfw_h2_current_stream_remove(TfwH2Ctx *ctx) +{ + T_DBG3("%s: ctx [%p] ctx->cur_stream %p\n", __func__, + ctx, ctx->cur_stream); + tfw_h2_stream_unlink_lock(ctx, ctx->cur_stream); + tfw_h2_stream_clean(ctx, ctx->cur_stream); + ctx->cur_stream = NULL; +} + +/* + * Clean the queue of closed streams if its size has exceeded a certain + * value. 
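+ * The threshold is TFW_MAX_CLOSED_STREAMS: while the queue is longer,
+ * the first (oldest) closed stream is unlinked under ctx->lock and
+ * cleaned outside of it.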
+ */ +void +tfw_h2_closed_streams_shrink(TfwH2Ctx *ctx) +{ + TfwStream *cur; + TfwStreamQueue *closed_streams = &ctx->closed_streams; + + T_DBG3("%s: ctx [%p] closed streams num %lu\n", __func__, ctx, + closed_streams->num); + + while (1) { + spin_lock(&ctx->lock); + + if (closed_streams->num <= TFW_MAX_CLOSED_STREAMS) { + spin_unlock(&ctx->lock); + break; + } + + BUG_ON(list_empty(&closed_streams->list)); + cur = list_first_entry(&closed_streams->list, TfwStream, + hcl_node); + tfw_h2_stream_unlink_nolock(ctx, cur); + + spin_unlock(&ctx->lock); + + T_DBG3("%s: ctx [%p] cur stream [%p]\n", __func__, ctx, cur); + + tfw_h2_stream_clean(ctx, cur); + } +} + +void +tfw_h2_check_current_stream_is_closed(TfwH2Ctx *ctx) +{ + BUG_ON(!ctx->cur_stream); + + T_DBG3("%s: strm [%p] id %u state %d(%s), streams_num %lu\n", + __func__, ctx->cur_stream, ctx->cur_stream->id, + tfw_h2_get_stream_state(ctx->cur_stream), + __h2_strm_st_n(ctx->cur_stream), ctx->streams_num); + + if (tfw_h2_stream_is_closed(ctx->cur_stream)) + tfw_h2_current_stream_remove(ctx); +} + +TfwStream * +tfw_h2_find_not_closed_stream(TfwH2Ctx *ctx, unsigned int id, bool recv) +{ + TfwStream *stream; + + stream = tfw_h2_find_stream(&ctx->sched, id); + return stream && !tfw_h2_stream_is_closed(stream) ? stream : NULL; +} + +/* + * Get stream ID for upper layer to create frames info. + */ +unsigned int +tfw_h2_req_stream_id(TfwHttpReq *req) +{ + unsigned int id = 0; + TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + + spin_lock(&ctx->lock); + + if (req->stream) + id = req->stream->id; + + spin_unlock(&ctx->lock); + + return id; +} + +/* + * Unlink request from corresponding stream (if linked). + */ +void +tfw_h2_req_unlink_stream(TfwHttpReq *req) +{ + TfwStream *stream; + TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + + spin_lock(&ctx->lock); + + stream = req->stream; + if (!stream) { + spin_unlock(&ctx->lock); + return; + } + + req->stream = NULL; + stream->msg = NULL; + + spin_unlock(&ctx->lock); +} + +/* + * Unlink request from corresponding stream (if linked), + * send RST STREAM and add stream to closed queue. 
+ */ +void +tfw_h2_req_unlink_stream_with_rst(TfwHttpReq *req) +{ + TfwStreamFsmRes r; + TfwStream *stream; + TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + + spin_lock(&ctx->lock); + + stream = req->stream; + if (!stream) { + spin_unlock(&ctx->lock); + return; + } + + req->stream = NULL; + stream->msg = NULL; + + r = tfw_h2_stream_fsm_ignore_err(ctx, stream, HTTP2_RST_STREAM, 0); + WARN_ON_ONCE(r != STREAM_FSM_RES_OK && r != STREAM_FSM_RES_IGNORE); + + tfw_h2_stream_add_to_queue_nolock(&ctx->closed_streams, stream); + + spin_unlock(&ctx->lock); +} + +int +tfw_h2_stream_xmit_prepare_resp(TfwStream *stream) +{ + TfwHttpResp *resp = stream->xmit.resp; + unsigned char tls_type; + unsigned int mark; + int r = 0; + + BUG_ON(!resp || resp->msg.skb_head || !resp->req + || !resp->req->conn || !stream->xmit.skb_head); + + tls_type = skb_tfw_tls_type(stream->xmit.skb_head); + mark = stream->xmit.skb_head->mark; + swap(resp->msg.skb_head, stream->xmit.skb_head); + + r = tfw_h2_resp_encode_headers(resp); + if (unlikely(r)) { + T_WARN("Failed to encode headers"); + goto finish; + } + + stream->xmit.h_len = resp->mit.acc_len; + stream->xmit.b_len = TFW_HTTP_RESP_CUT_BODY_SZ(resp); + if (test_bit(TFW_HTTP_B_CHUNKED, resp->flags)) + r = tfw_http_msg_cutoff_body_chunks(resp); + +finish: + swap(stream->xmit.skb_head, resp->msg.skb_head); + ss_skb_setup_head_of_list(stream->xmit.skb_head, mark, tls_type); + + return r; +} + +int +tfw_h2_entail_stream_skb(struct sock *sk, TfwH2Ctx *ctx, TfwStream *stream, + unsigned int *len, bool should_split) +{ + unsigned char tls_type = skb_tfw_tls_type(stream->xmit.skb_head); + unsigned int mark = stream->xmit.skb_head->mark; + struct sk_buff *skb, *split; + int r = 0; + + BUG_ON(!TFW_SKB_CB(stream->xmit.skb_head)->is_head); + while (*len) { + skb = ss_skb_dequeue(&stream->xmit.skb_head); + BUG_ON(!skb); + + if (unlikely(!skb->len)) { + T_DBG3("[%d]: %s: drop skb=%px data_len=%u len=%u\n", + smp_processor_id(), __func__, + skb, skb->data_len, skb->len); + kfree_skb(skb); + continue; + } + + BUG_ON(!tls_type); + BUG_ON(!skb->len); + + if (skb->len > *len) { + if (should_split) { + split = ss_skb_split(skb, *len); + if (!split) { + ss_skb_queue_head(&stream->xmit.skb_head, + skb); + r = -ENOMEM; + break; + } + + ss_skb_queue_head(&stream->xmit.skb_head, split); + } else { + ss_skb_queue_head(&stream->xmit.skb_head, skb); + break; + } + } + *len -= skb->len; + ss_skb_tcp_entail(sk, skb, mark, tls_type); + } + + /* + * We use tls_type and mark from skb_head when we entail data in + * socket write queue. So we should set tls_type and mark for the + * new skb_head. + */ + if (stream->xmit.skb_head + && !TFW_SKB_CB(stream->xmit.skb_head)->is_head) { + ss_skb_setup_head_of_list(stream->xmit.skb_head, mark, + tls_type); + } + + return r; +} diff --git a/fw/t/unit/test.c b/fw/t/unit/test.c index fb6c82fb1..efa18e2a2 100644 --- a/fw/t/unit/test.c +++ b/fw/t/unit/test.c @@ -2,7 +2,7 @@ * Tempesta FW * * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2021 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. 
* * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -99,6 +99,7 @@ TEST_SUITE(wq); TEST_SUITE(tls); TEST_SUITE(hpack); TEST_SUITE(pool); +TEST_SUITE(ebtree); extern int tfw_pool_init(void); extern void tfw_pool_exit(void); @@ -156,6 +157,9 @@ test_run_all(void) TEST_SUITE_RUN(pool); __fpu_schedule(); + TEST_SUITE_RUN(ebtree); + __fpu_schedule(); + kernel_fpu_end(); tfw_pool_exit(); diff --git a/fw/t/unit/test_ebtree.c b/fw/t/unit/test_ebtree.c new file mode 100644 index 000000000..de50327db --- /dev/null +++ b/fw/t/unit/test_ebtree.c @@ -0,0 +1,52 @@ +#include "test.h" +#include "helpers.h" + +#include "lib/eb64tree.h" + +#define EB64_NODES_MAX 1000 +static struct eb64_node nodes[EB64_NODES_MAX]; + +static unsigned long +find_min_key(struct eb64_node *nodes, int size) +{ + unsigned long min = nodes[0].key; + unsigned int i; + + for (i = 1; i < size; i++) { + if (nodes[i].key < min) + min = nodes[i].key; + } + + return min; +} + +TEST(ebtree, extract_min) +{ + struct eb_root tree = EB_ROOT; + struct eb64_node *root; + unsigned long min; + unsigned int i; + + for (i = 0; i < EB64_NODES_MAX; i++) { + nodes[i].key = get_random_long(); + eb64_insert(&tree, &nodes[i]); + } + + for (i = 0; i < EB64_NODES_MAX; i++) { + /* + * Find minimal node using linear search and compare + * it with the minimal value from the tree. + */ + min = find_min_key(nodes, EB64_NODES_MAX); + root = eb64_first(&tree); + EXPECT_EQ(root->key, min); + eb64_delete(root); + root->key = get_random_long(); + eb64_insert(&tree, root); + } +} + +TEST_SUITE(ebtree) +{ + TEST_RUN(ebtree, extract_min); +} diff --git a/fw/t/unit/test_hpack.c b/fw/t/unit/test_hpack.c index 64ea1e9f5..5428f5865 100644 --- a/fw/t/unit/test_hpack.c +++ b/fw/t/unit/test_hpack.c @@ -1,7 +1,7 @@ /** * Tempesta FW * - * Copyright (C) 2019-2023 Tempesta Technologies, Inc. + * Copyright (C) 2019-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -20,6 +20,7 @@ #include "hpack.c" #define tfw_connection_send(a, b) 0 #include "http_stream.c" +#include "http_stream_sched.c" #include "http_frame.c" #include "http.c" diff --git a/fw/tcp.h b/fw/tcp.h index 3461cb9c3..c315daaba 100644 --- a/fw/tcp.h +++ b/fw/tcp.h @@ -1,7 +1,7 @@ /** * TCP Socket API. * - * Copyright (C) 2015-2023 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -24,6 +24,41 @@ void tfw_tcp_propagate_dseq(struct sock *sk, struct sk_buff *skb); void tfw_tcp_setup_new_skb(struct sock *sk, struct sk_buff *skb, - struct sk_buff *nskb, unsigned int mss_now); + struct sk_buff *nskb, unsigned int mss_now); + +/* + * Calculate window size to send in bytes. We calculate the sender + * and receiver window and select the smallest of them. + * We ajust also @not_account_in_flight counf of skbs, which were + * previously pushed to socket write queue. In `tcp_write_xmit` + * main loop cong_win is calculated on each loop iteration and + * if we calculate `cong_win` for making frames without taking + * into account previously pushed skbs we push more data into + * socket write queue then we can send. 
+ */ +static inline unsigned long +tfw_tcp_calc_snd_wnd(struct sock *sk, unsigned int mss_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int in_flight = tcp_packets_in_flight(tp); + unsigned int qlen = skb_queue_len(&sk->sk_write_queue); + unsigned int send_win, cong_win; + + /* + * Update snd_cwnd if nedeed, to correct caclulation + * of count of bytes to send. + */ + tcp_slow_start_after_idle_check(sk); + + if (in_flight + qlen >= tp->snd_cwnd) + return 0; + + if (after(tp->write_seq, tcp_wnd_end(tp))) + return 0; + + cong_win = (tp->snd_cwnd - in_flight - qlen) * mss_now; + send_win = tcp_wnd_end(tp) - tp->write_seq; + return min(cong_win, send_win); +} #endif /* __TFW_TCP_H__ */ diff --git a/fw/tls.c b/fw/tls.c index aff86c978..efb6b6d95 100644 --- a/fw/tls.c +++ b/fw/tls.c @@ -3,7 +3,7 @@ * * Transport Layer Security (TLS) interfaces to Tempesta TLS. * - * Copyright (C) 2015-2023 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -233,20 +233,21 @@ tfw_tls_connection_recv(TfwConn *conn, struct sk_buff *skb) * can add the next skb in the send queue to the current encrypted TLS record. * * We extend the skbs on TCP transmission (when CWND is calculated), so we - * also adjust TCP sequence numbers in the socket. See skb_entail(). + * also adjust TCP sequence numbers in the socket. See tcp_skb_entail(). */ int tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, - unsigned int limit, unsigned int nskbs) + unsigned int limit) { /* * TODO #1103 currently even trivial 500-bytes HTTP message generates * 6 segment skb. After the fix the number probably should be decreased. */ #define AUTO_SEGS_N 8 +#define MAX_SEG_N 64 int r = -ENOMEM; - unsigned int head_sz, len, frags, t_sz, out_frags, i = 0; + unsigned int head_sz, len, frags, t_sz, out_frags, next_nents; unsigned char type; struct sk_buff *next = skb, *skb_tail = skb; struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -267,7 +268,7 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, xfrm = &tls->xfrm; T_DBG3("%s: sk=%pK(snd_una=%u snd_nxt=%u limit=%u)" - " skb=%pK(len=%u data_len=%u type=%u frags=%u headlen=%u" + " skb=%px(len=%u data_len=%u type=%u frags=%u headlen=%u" " seq=%u:%u)\n", __func__, sk, tcp_sk(sk)->snd_una, tcp_sk(sk)->snd_nxt, limit, skb, skb->len, skb->data_len, skb_tfw_tls_type(skb), @@ -279,7 +280,7 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, != tcb->end_seq); head_sz = ttls_payload_off(xfrm); - len = head_sz + skb->len + TTLS_TAG_LEN; + len = skb->len; type = skb_tfw_tls_type(skb); if (!type) { T_WARN("%s: bad skb type %u\n", __func__, type); @@ -291,10 +292,11 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, tcb->end_seq += head_sz; /* Try to aggregate several skbs into one TLS record. 
*/ - while (!tcp_skb_is_last(sk, skb_tail) && i++ < nskbs - 1) { + while (!tcp_skb_is_last(sk, skb_tail)) { next = skb_queue_next(&sk->sk_write_queue, skb_tail); + next_nents = skb_shinfo(next)->nr_frags + !!skb_headlen(next); - T_DBG3("next skb (%pK) in write queue: len=%u frags=%u/%u" + T_DBG3("next skb (%px) in write queue: len=%u frags=%u/%u" " type=%u seq=%u:%u\n", next, next->len, skb_shinfo(next)->nr_frags, !!skb_headlen(next), skb_tfw_tls_type(next), @@ -302,6 +304,8 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, if (len + next->len > limit) break; + if (unlikely(sgt.nents + next_nents > MAX_SEG_N)) + break; /* Don't put different message types into the same record. */ if (type != skb_tfw_tls_type(next)) break; @@ -313,11 +317,13 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, tfw_tcp_propagate_dseq(sk, skb_tail); len += next->len; - sgt.nents += skb_shinfo(next)->nr_frags + !!skb_headlen(next); - out_sgt.nents += skb_shinfo(next)->nr_frags + !!skb_headlen(next); + sgt.nents += next_nents; + out_sgt.nents += next_nents; skb_tail = next; } + len += head_sz + TTLS_TAG_LEN; + /* * Use skb_tail->next as skb_head in __extend_pgfrags() to not try to * put TAG to the next skb, which is out of our limit. In worst case, @@ -485,6 +491,7 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, __func__, r); return r; #undef AUTO_SEGS_N +#undef MAX_SEG_N } static inline int @@ -513,6 +520,31 @@ tfw_tls_close_msg_flags(TlsIOCtx *io) return flags; } +static inline int +tfw_tls_on_send_alert(void *conn, struct sk_buff **skb_head) +{ + TfwH2Ctx *ctx; + + BUG_ON(TFW_CONN_PROTO((TfwConn *)conn) != TFW_FSM_H2); + ctx = tfw_h2_context_safe((TfwConn *)conn); + if (!ctx) + return 0; + + if (ctx->error && ctx->error->xmit.skb_head) { + ss_skb_queue_splice(&ctx->error->xmit.skb_head, skb_head); + } else if (ctx->cur_send_headers) { + /* + * Other frames (from any stream) MUST NOT occur between + * the HEADERS frame and any CONTINUATION frames that might + * follow. Send TLS alert later. + */ + ctx->error = ctx->cur_send_headers; + ss_skb_queue_splice(&ctx->error->xmit.skb_head, skb_head); + } + + return 0; +} + /** * Callback function which is called by TLS module under tls->lock when it * initiates a record transmission, e.g. alert or a handshake message. 
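A minimal sketch of how the per-skb hooks introduced in ss_skb.h above are
consumed on the transmit path; the wrapper below is hypothetical, only
ss_skb_setup_head_of_list(), ss_skb_on_send() and ss_skb_tcp_entail_list()
come from this patch (see ss_do_send() in fw/sock.c for the real flow):

#include "ss_skb.h"
#include "sync_socket.h"

static void
example_tx_path(struct sock *sk, struct sk_buff **skb_head,
		unsigned int mark, unsigned char tls_type)
{
	/* Mark the list head: entailed skbs inherit its mark and tls_type. */
	ss_skb_setup_head_of_list(*skb_head, mark, tls_type);

	/*
	 * Run the optional hook of the head skb, e.g. tfw_tls_on_send_alert()
	 * set just above: it may splice the whole list into a stream's xmit
	 * queue, leaving @skb_head empty, or report an error.
	 */
	if (ss_skb_on_send(sk->sk_user_data, skb_head))
		return;	/* the real code purges the remaining skbs here */

	/* No-op if the hook consumed the list. */
	ss_skb_tcp_entail_list(sk, skb_head);
}
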
@@ -589,6 +621,10 @@ tfw_tls_send(TlsCtx *tls, struct sg_table *sgt) io->alert[0] == TTLS_ALERT_LEVEL_FATAL)) { TFW_CONN_TYPE(((TfwConn *)conn)) |= Conn_Stop; flags |= tfw_tls_close_msg_flags(io); + if (TFW_CONN_PROTO((TfwConn *)conn) == TFW_FSM_H2) { + TFW_SKB_CB(io->skb_list)->on_send = + tfw_tls_on_send_alert; + } } r = ss_send(conn->cli_conn.sk, &io->skb_list, flags); @@ -611,10 +647,9 @@ tfw_tls_conn_dtor(void *c) { struct sk_buff *skb; TlsCtx *tls = tfw_tls_context(c); - bool hs_was_done = false; if (TFW_CONN_PROTO((TfwConn *)c) == TFW_FSM_H2 - && (hs_was_done = ttls_hs_done(tls))) + && ttls_hs_done(tls)) tfw_h2_context_clear(tfw_h2_context_unsafe(c)); if (tls) { @@ -957,15 +992,16 @@ tfw_tls_over(TlsCtx *tls, int state) { int sk_proto = ((SsProto *)tls->sk->sk_user_data)->type; TfwConn *conn = (TfwConn*)tls->sk->sk_user_data; + int r; if (state == TTLS_HS_CB_FINISHED_NEW || state == TTLS_HS_CB_FINISHED_RESUMED) TFW_INC_STAT_BH(serv.tls_hs_successful); if (TFW_FSM_TYPE(sk_proto) == TFW_FSM_H2 && - tfw_h2_context_init(tfw_h2_context_unsafe(conn))) { + ((r = tfw_h2_context_init(tfw_h2_context_unsafe(conn))))) { T_ERR("cannot establish a new h2 connection\n"); - return T_DROP; + return r; } return frang_tls_handler(tls, state); diff --git a/fw/tls.h b/fw/tls.h index db2536258..736517bf4 100644 --- a/fw/tls.h +++ b/fw/tls.h @@ -1,7 +1,7 @@ /** * Tempesta FW * - * Copyright (C) 2015-2022 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -31,7 +31,7 @@ void tfw_tls_cfg_configured(bool global); void tfw_tls_set_allow_any_sni(bool match); int tfw_tls_cfg_alpn_protos(const char *cfg_str); int tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, - unsigned int limit, unsigned int nskbs); + unsigned int limit); typedef struct tfw_conn_t TfwConn; int tfw_tls_connection_recv(TfwConn *conn, struct sk_buff *skb); diff --git a/fw/work_queue.c b/fw/work_queue.c index 89ceaf9fc..f978f2cda 100644 --- a/fw/work_queue.c +++ b/fw/work_queue.c @@ -5,7 +5,7 @@ * complicated MPMC case at http://www.linuxjournal.com/content/lock-free- \ * multi-producer-multi-consumer-queue-ring-buffer . * - * Copyright (C) 2016-2018 Tempesta Technologies, Inc. + * Copyright (C) 2016-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -36,7 +36,7 @@ tfw_wq_init(TfwRBQueue *q, size_t qsize, int node) if (!q->heads) return -ENOMEM; - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { atomic64_t *local_head = per_cpu_ptr(q->heads, cpu); atomic64_set(local_head, LLONG_MAX); } diff --git a/ktest/linux/percpu.h b/ktest/linux/percpu.h index 80e8971af..6db559c29 100644 --- a/ktest/linux/percpu.h +++ b/ktest/linux/percpu.h @@ -1,7 +1,7 @@ /** * Tempesta kernel emulation unit testing framework. * - * Copyright (C) 2015-2020 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. 
* * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -33,7 +33,7 @@ #define alloc_percpu(t) calloc(NR_CPUS, sizeof(t)) #define __alloc_percpu(s, a) calloc(NR_CPUS, (s)) #define free_percpu(p) free(p) -#define for_each_possible_cpu(c) for (c = 0; c < NR_CPUS; ++c) +#define for_each_online_cpu(c) for (c = 0; c < NR_CPUS; ++c) #if NR_CPUS == 1 diff --git a/lib/Makefile b/lib/Makefile index f7991a420..a4d6d9fce 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -1,6 +1,6 @@ # Tempesta DB # -# Copyright (C) 2018-2019 Tempesta Technologies, Inc. +# Copyright (C) 2018-2024 Tempesta Technologies, Inc. # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -22,7 +22,7 @@ obj-m = tempesta_lib.o GCOV_PROFILE := $(TFW_GCOV) -tempesta_lib-objs = hash.o main.o +tempesta_lib-objs = hash.o main.o ebtree.o eb64tree.o ifdef AVX2 tempesta_lib-objs += str_simd.o endif diff --git a/lib/eb64tree.c b/lib/eb64tree.c new file mode 100755 index 000000000..a6a421508 --- /dev/null +++ b/lib/eb64tree.c @@ -0,0 +1,35 @@ +/* + * Elastic Binary Trees - exported functions for operations on 64bit nodes. + * + * Copyright (C) 2000-2015 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* Consult eb64tree.h for more details about those functions */ + +#include "eb64tree.h" + +struct eb64_node *eb64_insert(struct eb_root *root, struct eb64_node *new) +{ + return __eb64_insert(root, new); +} +EXPORT_SYMBOL(eb64_insert); diff --git a/lib/eb64tree.h b/lib/eb64tree.h new file mode 100755 index 000000000..81ba27bec --- /dev/null +++ b/lib/eb64tree.h @@ -0,0 +1,273 @@ +/* + * Elastic Binary Trees - macros and structures for operations on 64bit nodes. 
+ * + * Copyright (C) 2000-2015 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _EB64TREE_H +#define _EB64TREE_H + +#include "ebtree.h" + +#include + + +/* Return the structure of type whose member points to */ +#define eb64_entry(ptr, type, member) container_of(ptr, type, member) + +#define EB64_ROOT EB_ROOT +#define EB64_TREE_HEAD EB_TREE_HEAD + +/* This structure carries a node, a leaf, and a key. It must start with the + * eb_node so that it can be cast into an eb_node. We could also have put some + * sort of transparent union here to reduce the indirection level, but the fact + * is, the end user is not meant to manipulate internals, so this is pointless. + * In case sizeof(void*)>=sizeof(u64), we know there will be some padding after + * the key if it's unaligned. In this case we force the alignment on void* so + * that we prefer to have the padding before for more efficient accesses. + */ +struct eb64_node { + struct eb_node node; /* the tree node, must be at the beginning */ + MAYBE_ALIGN(sizeof(u64)); + ALWAYS_ALIGN(sizeof(void*)); + u64 key; +} ALIGNED(sizeof(void*)); + +/* + * Exported functions and macros. + * Many of them are always inlined because they are extremely small, and + * are generally called at most once or twice in a program. + */ + +/* Return leftmost node in the tree, or NULL if none */ +static inline struct eb64_node *eb64_first(struct eb_root *root) +{ + return eb64_entry(eb_first(root), struct eb64_node, node); +} + +/* Delete node from the tree if it was linked in. Mark the node unused. Note + * that this function relies on a non-inlined generic function: eb_delete. + */ +static inline void eb64_delete(struct eb64_node *eb64) +{ + eb_delete(&eb64->node); +} + +/* + * The following functions are not inlined by default. They are declared + * in eb64tree.c, which simply relies on their inline version. + */ +struct eb64_node *eb64_insert(struct eb_root *root, struct eb64_node *new); + +/* + * The following functions are less likely to be used directly, because their + * code is larger. The non-inlined version is preferred. + */ + +/* Delete node from the tree if it was linked in. Mark the node unused. */ +static forceinline void __eb64_delete(struct eb64_node *eb64) +{ + __eb_delete(&eb64->node); +} + +/* Insert eb64_node into subtree starting at node root . + * Only new->key needs be set with the key. The eb64_node is returned. 
+ * If root->b[EB_RGHT]==1, the tree may only contain unique keys. + */ +static forceinline struct eb64_node * +__eb64_insert(struct eb_root *root, struct eb64_node *new) { + struct eb64_node *old; + unsigned int side; + eb_troot_t *troot; + u64 newkey; /* caching the key saves approximately one cycle */ + eb_troot_t *root_right; + int old_node_bit; + + side = EB_LEFT; + troot = root->b[EB_LEFT]; + root_right = root->b[EB_RGHT]; + if (unlikely(troot == NULL)) { + /* Tree is empty, insert the leaf part below the left branch */ + root->b[EB_LEFT] = eb_dotag(&new->node.branches, EB_LEAF); + new->node.leaf_p = eb_dotag(root, EB_LEFT); + new->node.node_p = NULL; /* node part unused */ + return new; + } + + /* The tree descent is fairly easy : + * - first, check if we have reached a leaf node + * - second, check if we have gone too far + * - third, reiterate + * Everywhere, we use for the node node we are inserting, + * for the node we attach it to, and for the node we are + * displacing below . will always point to the future node + * (tagged with its type). carries the side the node is + * attached to below its parent, which is also where previous node + * was attached. carries the key being inserted. + */ + newkey = new->key; + + while (1) { + if (unlikely(eb_gettag(troot) == EB_LEAF)) { + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf, *old_leaf; + + old = container_of(eb_untag(troot, EB_LEAF), + struct eb64_node, node.branches); + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + old_leaf = eb_dotag(&old->node.branches, EB_LEAF); + + new->node.node_p = old->node.leaf_p; + + /* Right here, we have 3 possibilities : + - the tree does not contain the key, and we have + new->key < old->key. We insert new above old, on + the left ; + + - the tree does not contain the key, and we have + new->key > old->key. We insert new above old, on + the right ; + + - the tree does contain the key, which implies it + is alone. We add the new key next to it as a + first duplicate. + + The last two cases can easily be partially merged. + */ + + if (new->key < old->key) { + new->node.leaf_p = new_left; + old->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = old_leaf; + } else { + /* we may refuse to duplicate this key if the tree is + * tagged as containing only unique keys. + */ + if ((new->key == old->key) && eb_gettag(root_right)) + return old; + + /* new->key >= old->key, new goes the right */ + old->node.leaf_p = new_left; + new->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = old_leaf; + new->node.branches.b[EB_RGHT] = new_leaf; + + if (new->key == old->key) { + new->node.bit = -1; + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + return new; + } + } + break; + } + + /* OK we're walking down this link */ + old = container_of(eb_untag(troot, EB_NODE), + struct eb64_node, node.branches); + old_node_bit = old->node.bit; + + /* Stop going down when we don't have common bits anymore. We + * also stop in front of a duplicates tree because it means we + * have to insert above. + */ + + if ((old_node_bit < 0) || /* we're above a duplicate tree, stop here */ + (((new->key ^ old->key) >> old_node_bit) >= EB_NODE_BRANCHES)) { + /* The tree did not contain the key, so we insert before the node + * , and set ->bit to designate the lowest bit position in + * which applies to ->branches.b[]. 
+ */ + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf, *old_node; + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + old_node = eb_dotag(&old->node.branches, EB_NODE); + + new->node.node_p = old->node.node_p; + + if (new->key < old->key) { + new->node.leaf_p = new_left; + old->node.node_p = new_rght; + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = old_node; + } + else if (new->key > old->key) { + old->node.node_p = new_left; + new->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = old_node; + new->node.branches.b[EB_RGHT] = new_leaf; + } + else { + struct eb_node *ret; + ret = eb_insert_dup(&old->node, &new->node); + return container_of(ret, struct eb64_node, node); + } + break; + } + + /* walk down */ + root = &old->node.branches; + + if (sizeof(long) >= 8) { + side = newkey >> old_node_bit; + } else { + /* note: provides the best code on low-register count archs + * such as i386. + */ + side = newkey; + side >>= old_node_bit; + if (old_node_bit >= 32) { + side = newkey >> 32; + side >>= old_node_bit & 0x1F; + } + } + side &= EB_NODE_BRANCH_MASK; + troot = root->b[side]; + } + + /* Ok, now we are inserting between and . 's + * parent is already set to , and the 's branch is still in + * . Update the root's leaf till we have it. Note that we can also + * find the side by checking the side of new->node.node_p. + */ + + /* We need the common higher bits between new->key and old->key. + * What differences are there between new->key and the node here ? + * NOTE that bit(new) is always < bit(root) because highest + * bit of new->key and old->key are identical here (otherwise they + * would sit on different branches). + */ + /* note that if EB_NODE_BITS > 1, we should check that it's still >= 0 */ + new->node.bit = fls64(new->key ^ old->key) - EB_NODE_BITS; + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + + return new; +} + +#endif /* _EB64_TREE_H */ diff --git a/lib/ebtree.c b/lib/ebtree.c new file mode 100755 index 000000000..c9f9953dd --- /dev/null +++ b/lib/ebtree.c @@ -0,0 +1,40 @@ +/* + * Elastic Binary Trees - exported generic functions + * + * Copyright (C) 2000-2015 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
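For illustration, here is a minimal usage sketch of the eb64 API brought in by the new lib/eb64tree.h above. The struct my_timer container, the expiration-time key and the helpers around it are hypothetical and not part of the patch; only eb64_node, EB_ROOT, eb64_insert(), eb64_first() and eb64_delete() come from the imported code.

/* Hypothetical container embedding an eb64 node, keyed by expiration time. */
struct my_timer {
	struct eb64_node	node;	/* must stay castable to eb_node */
	void			(*cb)(struct my_timer *t);
};

static struct eb_root my_timers = EB_ROOT;	/* duplicate keys allowed */

static void my_timer_add(struct my_timer *t, u64 expires)
{
	t->node.key = expires;
	eb64_insert(&my_timers, &t->node);	/* descends the tree by key bits */
}

static struct my_timer *my_timer_next_expiring(void)
{
	struct eb64_node *n = eb64_first(&my_timers);	/* leftmost leaf, smallest key */

	return n ? container_of(n, struct my_timer, node) : NULL;
}

static void my_timer_del(struct my_timer *t)
{
	eb64_delete(&t->node);	/* no-op when leaf_p is NULL, i.e. not linked */
}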
+ */ + +#include "ebtree.h" + +void eb_delete(struct eb_node *node) +{ + __eb_delete(node); +} +EXPORT_SYMBOL(eb_delete); + +/* used by insertion primitives */ +struct eb_node *eb_insert_dup(struct eb_node *sub, struct eb_node *new) +{ + return __eb_insert_dup(sub, new); +} +EXPORT_SYMBOL(eb_insert_dup); diff --git a/lib/ebtree.h b/lib/ebtree.h new file mode 100755 index 000000000..affddd145 --- /dev/null +++ b/lib/ebtree.h @@ -0,0 +1,597 @@ +/* + * Elastic Binary Trees - generic macros and structures. + * + * Copyright (C) 2000-2015 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + + + +/* + General idea: + ------------- + In a radix binary tree, we may have up to 2N-1 nodes for N keys if all of + them are leaves. If we find a way to differentiate intermediate nodes (later + called "nodes") and final nodes (later called "leaves"), and we associate + them by two, it is possible to build sort of a self-contained radix tree with + intermediate nodes always present. It will not be as cheap as the ultree for + optimal cases as shown below, but the optimal case almost never happens : + + Eg, to store 8, 10, 12, 13, 14 : + + ultree this theorical tree + + 8 8 + / \ / \ + 10 12 10 12 + / \ / \ + 13 14 12 14 + / \ + 12 13 + + Note that on real-world tests (with a scheduler), is was verified that the + case with data on an intermediate node never happens. This is because the + data spectrum is too large for such coincidences to happen. It would require + for instance that a task has its expiration time at an exact second, with + other tasks sharing that second. This is too rare to try to optimize for it. + + What is interesting is that the node will only be added above the leaf when + necessary, which implies that it will always remain somewhere above it. So + both the leaf and the node can share the exact value of the leaf, because + when going down the node, the bit mask will be applied to comparisons. So we + are tempted to have one single key shared between the node and the leaf. + + The bit only serves the nodes, and the dups only serve the leaves. So we can + put a lot of information in common. 
This results in one single entity with + two branch pointers and two parent pointers, one for the node part, and one + for the leaf part : + + node's leaf's + parent parent + | | + [node] [leaf] + / \ + left right + branch branch + + The node may very well refer to its leaf counterpart in one of its branches, + indicating that its own leaf is just below it : + + node's + parent + | + [node] + / \ + left [leaf] + branch + + Adding keys in such a tree simply consists in inserting nodes between + other nodes and/or leaves : + + [root] + | + [node2] + / \ + [leaf1] [node3] + / \ + [leaf2] [leaf3] + + On this diagram, we notice that [node2] and [leaf2] have been pulled away + from each other due to the insertion of [node3], just as if there would be + an elastic between both parts. This elastic-like behaviour gave its name to + the tree : "Elastic Binary Tree", or "EBtree". The entity which associates a + node part and a leaf part will be called an "EB node". + + We also notice on the diagram that there is a root entity required to attach + the tree. It only contains two branches and there is nothing above it. This + is an "EB root". Some will note that [leaf1] has no [node1]. One property of + the EBtree is that all nodes have their branches filled, and that if a node + has only one branch, it does not need to exist. Here, [leaf1] was added + below [root] and did not need any node. + + An EB node contains : + - a pointer to the node's parent (node_p) + - a pointer to the leaf's parent (leaf_p) + - two branches pointing to lower nodes or leaves (branches) + - a bit position (bit) + - an optional key. + + The key here is optional because it's used only during insertion, in order + to classify the nodes. Nothing else in the tree structure requires knowledge + of the key. This makes it possible to write type-agnostic primitives for + everything, and type-specific insertion primitives. This has led to consider + two types of EB nodes. The type-agnostic ones will serve as a header for the + other ones, and will simply be called "struct eb_node". The other ones will + have their type indicated in the structure name. Eg: "struct eb32_node" for + nodes carrying 32 bit keys. + + We will also node that the two branches in a node serve exactly the same + purpose as an EB root. For this reason, a "struct eb_root" will be used as + well inside the struct eb_node. In order to ease pointer manipulation and + ROOT detection when walking upwards, all the pointers inside an eb_node will + point to the eb_root part of the referenced EB nodes, relying on the same + principle as the linked lists in Linux. + + Another important point to note, is that when walking inside a tree, it is + very convenient to know where a node is attached in its parent, and what + type of branch it has below it (leaf or node). In order to simplify the + operations and to speed up the processing, it was decided in this specific + implementation to use the lowest bit from the pointer to designate the side + of the upper pointers (left/right) and the type of a branch (leaf/node). + This practise is not mandatory by design, but an implementation-specific + optimisation permitted on all platforms on which data must be aligned. All + known 32 bit platforms align their integers and pointers to 32 bits, leaving + the two lower bits unused. So, we say that the pointers are "tagged". And + since they designate pointers to root parts, we simply call them + "tagged root pointers", or "eb_troot" in the code. 
+ + Duplicate keys are stored in a special manner. When inserting a key, if + the same one is found, then an incremental binary tree is built at this + place from these keys. This ensures that no special case has to be written + to handle duplicates when walking through the tree or when deleting entries. + It also guarantees that duplicates will be walked in the exact same order + they were inserted. This is very important when trying to achieve fair + processing distribution for instance. + + Algorithmic complexity can be derived from 3 variables : + - the number of possible different keys in the tree : P + - the number of entries in the tree : N + - the number of duplicates for one key : D + + Note that this tree is deliberately NOT balanced. For this reason, the worst + case may happen with a small tree (eg: 32 distinct keys of one bit). BUT, + the operations required to manage such data are so much cheap that they make + it worth using it even under such conditions. For instance, a balanced tree + may require only 6 levels to store those 32 keys when this tree will + require 32. But if per-level operations are 5 times cheaper, it wins. + + Minimal, Maximal and Average times are specified in number of operations. + Minimal is given for best condition, Maximal for worst condition, and the + average is reported for a tree containing random keys. An operation + generally consists in jumping from one node to the other. + + Complexity : + - lookup : min=1, max=log(P), avg=log(N) + - insertion from root : min=1, max=log(P), avg=log(N) + - insertion of dups : min=1, max=log(D), avg=log(D)/2 after lookup + - deletion : min=1, max=1, avg=1 + - prev/next : min=1, max=log(P), avg=2 : + N/2 nodes need 1 hop => 1*N/2 + N/4 nodes need 2 hops => 2*N/4 + N/8 nodes need 3 hops => 3*N/8 + ... + N/x nodes need log(x) hops => log2(x)*N/x + Total cost for all N nodes : sum[i=1..N](log2(i)*N/i) = N*sum[i=1..N](log2(i)/i) + Average cost across N nodes = total / N = sum[i=1..N](log2(i)/i) = 2 + + This design is currently limited to only two branches per node. Most of the + tree descent algorithm would be compatible with more branches (eg: 4, to cut + the height in half), but this would probably require more complex operations + and the deletion algorithm would be problematic. + + Useful properties : + - a node is always added above the leaf it is tied to, and never can get + below nor in another branch. This implies that leaves directly attached + to the root do not use their node part, which is indicated by a NULL + value in node_p. This also enhances the cache efficiency when walking + down the tree, because when the leaf is reached, its node part will + already have been visited (unless it's the first leaf in the tree). + + - pointers to lower nodes or leaves are stored in "branch" pointers. Only + the root node may have a NULL in either branch, it is not possible for + other branches. Since the nodes are attached to the left branch of the + root, it is not possible to see a NULL left branch when walking up a + tree. Thus, an empty tree is immediately identified by a NULL left + branch at the root. Conversely, the one and only way to identify the + root node is to check that it right branch is NULL. Note that the + NULL pointer may have a few low-order bits set. + + - a node connected to its own leaf will have branch[0|1] pointing to + itself, and leaf_p pointing to itself. + + - a node can never have node_p pointing to itself. + + - a node is linked in a tree if and only if it has a non-null leaf_p. 
+ + - a node can never have both branches equal, except for the root which can + have them both NULL. + + - deletion only applies to leaves. When a leaf is deleted, its parent must + be released too (unless it's the root), and its sibling must attach to + the grand-parent, replacing the parent. Also, when a leaf is deleted, + the node tied to this leaf will be removed and must be released too. If + this node is different from the leaf's parent, the freshly released + leaf's parent will be used to replace the node which must go. A released + node will never be used anymore, so there's no point in tracking it. + + - the bit index in a node indicates the bit position in the key which is + represented by the branches. That means that a node with (bit == 0) is + just above two leaves. Negative bit values are used to build a duplicate + tree. The first node above two identical leaves gets (bit == -1). This + value logarithmically decreases as the duplicate tree grows. During + duplicate insertion, a node is inserted above the highest bit value (the + lowest absolute value) in the tree during the right-sided walk. If bit + -1 is not encountered (highest < -1), we insert above last leaf. + Otherwise, we insert above the node with the highest value which was not + equal to the one of its parent + 1. + + - the "eb_next" primitive walks from left to right, which means from lower + to higher keys. It returns duplicates in the order they were inserted. + The "eb_first" primitive returns the left-most entry. + + - the "eb_prev" primitive walks from right to left, which means from + higher to lower keys. It returns duplicates in the opposite order they + were inserted. The "eb_last" primitive returns the right-most entry. + + - a tree which has 1 in the lower bit of its root's right branch is a + tree with unique nodes. This means that when a node is inserted with + a key which already exists will not be inserted, and the previous + entry will be returned. + + */ +#ifndef __EBTREE_H__ +#define __EBTREE_H__ + +#include + +/* By default, gcc does not inline large chunks of code, but we want it to + * respect our choices. + */ +#if !defined(forceinline) +#if __GNUC__ < 3 +#define forceinline inline +#else +#define forceinline inline __attribute__((always_inline)) +#endif +#endif + +/* sets alignment for current field or variable */ +#ifndef ALIGNED +#define ALIGNED(x) __attribute__((aligned(x))) +#endif + +/* add a mandatory alignment for next fields in a structure */ +#ifndef ALWAYS_ALIGN +#define ALWAYS_ALIGN(x) union { } ALIGNED(x) +#endif + +/* add an optional alignment for next fields in a structure, only for archs + * which do not support unaligned accesses. + */ +#ifndef MAYBE_ALIGN +#define MAYBE_ALIGN(x) union { } ALIGNED(x) +#else +#define MAYBE_ALIGN(x) +#endif + +/* Number of bits per node, and number of leaves per node */ +#define EB_NODE_BITS 1 +#define EB_NODE_BRANCHES (1 << EB_NODE_BITS) +#define EB_NODE_BRANCH_MASK (EB_NODE_BRANCHES - 1) + +/* Be careful not to tweak those values. The walking code is optimized for NULL + * detection on the assumption that the following values are intact. + */ +#define EB_LEFT 0 +#define EB_RGHT 1 +#define EB_LEAF 0 +#define EB_NODE 1 + +/* Tags to set in root->b[EB_RGHT] : + * - EB_NORMAL is a normal tree which stores duplicate keys. + * - EB_UNIQUE is a tree which stores unique keys. + */ +#define EB_NORMAL 0 +#define EB_UNIQUE 1 + +/* This is the same as an eb_node pointer, except that the lower bit embeds + * a tag. See eb_dotag()/eb_untag()/eb_gettag(). 
This tag has two meanings : + * - 0=left, 1=right to designate the parent's branch for leaf_p/node_p + * - 0=link, 1=leaf to designate the branch's type for branch[] + */ +typedef void eb_troot_t; + +/* The eb_root connects the node which contains it, to two nodes below it, one + * of which may be the same node. At the top of the tree, we use an eb_root + * too, which always has its right branch NULL (+/1 low-order bits). + */ +struct eb_root { + eb_troot_t *b[EB_NODE_BRANCHES]; /* left and right branches */ +}; + +/* The eb_node contains the two parts, one for the leaf, which always exists, + * and one for the node, which remains unused in the very first node inserted + * into the tree. This structure is 20 bytes per node on 32-bit machines. Do + * not change the order, benchmarks have shown that it's optimal this way. + * Note: be careful about this struct's alignment if it gets included into + * another struct and some atomic ops are expected on the keys or the node. + */ +struct eb_node { + struct eb_root branches; /* branches, must be at the beginning */ + eb_troot_t *node_p; /* link node's parent */ + eb_troot_t *leaf_p; /* leaf node's parent */ + short int bit; /* link's bit position. */ + short unsigned int pfx; /* data prefix length, always related to leaf */ +} __attribute__((packed)); + +/* Return the structure of type whose member points to */ +#define eb_entry(ptr, type, member) container_of(ptr, type, member) + +/* The root of a tree is an eb_root initialized with both pointers NULL. + * During its life, only the left pointer will change. The right one will + * always remain NULL, which is the way we detect it. + */ +#define EB_ROOT \ + (struct eb_root) { \ + .b = {[0] = NULL, [1] = NULL }, \ + } + +#define EB_ROOT_UNIQUE \ + (struct eb_root) { \ + .b = {[0] = NULL, [1] = (void *)1 }, \ + } + +#define EB_TREE_HEAD(name) \ + struct eb_root name = EB_ROOT + + +/***************************************\ + * Private functions. Not for end-user * +\***************************************/ + +/* Converts a root pointer to its equivalent eb_troot_t pointer, + * ready to be stored in ->branch[], leaf_p or node_p. NULL is not + * conserved. To be used with EB_LEAF, EB_NODE, EB_LEFT or EB_RGHT in . + */ +static inline eb_troot_t *eb_dotag(const struct eb_root *root, const int tag) +{ + return (eb_troot_t *)((char *)root + tag); +} + +/* Converts an eb_troot_t pointer pointer to its equivalent eb_root pointer, + * for use with pointers from ->branch[], leaf_p or node_p. NULL is conserved + * as long as the tree is not corrupted. To be used with EB_LEAF, EB_NODE, + * EB_LEFT or EB_RGHT in . + */ +static inline struct eb_root *eb_untag(const eb_troot_t *troot, const int tag) +{ + return (struct eb_root *)((char *)troot - tag); +} + +/* returns the tag associated with an eb_troot_t pointer */ +static inline int eb_gettag(eb_troot_t *troot) +{ + return (unsigned long)troot & 1; +} + +/* Converts a root pointer to its equivalent eb_troot_t pointer and clears the + * tag, no matter what its value was. + */ +static inline struct eb_root *eb_clrtag(const eb_troot_t *troot) +{ + return (struct eb_root *)((unsigned long)troot & ~1UL); +} + +/* Returns a pointer to the eb_node holding */ +static inline struct eb_node *eb_root_to_node(struct eb_root *root) +{ + return container_of(root, struct eb_node, branches); +} + +/* Walks down starting at root pointer , and always walking on side + * . It either returns the node hosting the first leaf on that side, + * or NULL if no leaf is found. 
may either be NULL or a branch pointer. + * The pointer to the leaf (or NULL) is returned. + */ +static inline struct eb_node *eb_walk_down(eb_troot_t *start, unsigned int side) +{ + /* A NULL pointer on an empty tree root will be returned as-is */ + while (eb_gettag(start) == EB_NODE) + start = (eb_untag(start, EB_NODE))->b[side]; + /* NULL is left untouched (root==eb_node, EB_LEAF==0) */ + return eb_root_to_node(eb_untag(start, EB_LEAF)); +} + +/* This function is used to build a tree of duplicates by adding a new node to + * a subtree of at least 2 entries. It will probably never be needed inlined, + * and it is not for end-user. + */ +static forceinline struct eb_node * +__eb_insert_dup(struct eb_node *sub, struct eb_node *new) +{ + struct eb_node *head = sub; + + eb_troot_t *new_left = eb_dotag(&new->branches, EB_LEFT); + eb_troot_t *new_rght = eb_dotag(&new->branches, EB_RGHT); + eb_troot_t *new_leaf = eb_dotag(&new->branches, EB_LEAF); + + /* first, identify the deepest hole on the right branch */ + while (eb_gettag(head->branches.b[EB_RGHT]) != EB_LEAF) { + struct eb_node *last = head; + head = container_of(eb_untag(head->branches.b[EB_RGHT], EB_NODE), + struct eb_node, branches); + if (head->bit > last->bit + 1) + sub = head; /* there's a hole here */ + } + + /* Here we have a leaf attached to (head)->b[EB_RGHT] */ + if (head->bit < -1) { + /* A hole exists just before the leaf, we insert there */ + new->bit = -1; + sub = container_of(eb_untag(head->branches.b[EB_RGHT], EB_LEAF), + struct eb_node, branches); + head->branches.b[EB_RGHT] = eb_dotag(&new->branches, EB_NODE); + + new->node_p = sub->leaf_p; + new->leaf_p = new_rght; + sub->leaf_p = new_left; + new->branches.b[EB_LEFT] = eb_dotag(&sub->branches, EB_LEAF); + new->branches.b[EB_RGHT] = new_leaf; + return new; + } else { + int side; + /* No hole was found before a leaf. We have to insert above + * . Note that we cannot be certain that is attached + * to the right of its parent, as this is only true if + * is inside the dup tree, not at the head. + */ + new->bit = sub->bit - 1; /* install at the lowest level */ + side = eb_gettag(sub->node_p); + head = container_of(eb_untag(sub->node_p, side), struct eb_node, branches); + head->branches.b[side] = eb_dotag(&new->branches, EB_NODE); + + new->node_p = sub->node_p; + new->leaf_p = new_rght; + sub->node_p = new_left; + new->branches.b[EB_LEFT] = eb_dotag(&sub->branches, EB_NODE); + new->branches.b[EB_RGHT] = new_leaf; + return new; + } +} + + +/**************************************\ + * Public functions, for the end-user * +\**************************************/ + +/* Return non-zero if the tree is empty, otherwise zero */ +static inline int eb_is_empty(struct eb_root *root) +{ + return !root->b[EB_LEFT]; +} + +/* Return the first leaf in the tree starting at , or NULL if none */ +static inline struct eb_node *eb_first(struct eb_root *root) +{ + return eb_walk_down(root->b[0], EB_LEFT); +} + +/* Return the last leaf in the tree starting at , or NULL if none */ +static inline struct eb_node *eb_last(struct eb_root *root) +{ + return eb_walk_down(root->b[0], EB_RGHT); +} + +/* Removes a leaf node from the tree if it was still in it. Marks the node + * as unlinked. 
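A quick, purely illustrative check of the pointer-tagging scheme used by the helpers above: the low bit of an eb_troot_t value carries either the branch side (left/right) or the branch type (leaf/node), and it round-trips through eb_dotag(), eb_gettag(), eb_untag() and eb_clrtag(). The demo function is hypothetical and BUG_ON() is used here only as an assertion.

static void eb_tag_demo(struct eb_node *n)
{
	/* Tag a branch pointer as pointing to a node part (EB_NODE == 1). */
	eb_troot_t *t = eb_dotag(&n->branches, EB_NODE);

	BUG_ON(eb_gettag(t) != EB_NODE);		/* the low bit is the tag */
	BUG_ON(eb_untag(t, EB_NODE) != &n->branches);	/* same tag restores the pointer */
	BUG_ON(eb_clrtag(t) != &n->branches);		/* or strip the tag blindly */
}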
+ */ +static forceinline void __eb_delete(struct eb_node *node) +{ + unsigned int pside, gpside, sibtype; + struct eb_node *parent; + struct eb_root *gparent; + + if (!node->leaf_p) + return; + + /* we need the parent, our side, and the grand parent */ + pside = eb_gettag(node->leaf_p); + parent = eb_root_to_node(eb_untag(node->leaf_p, pside)); + + /* We likely have to release the parent link, unless it's the root, + * in which case we only set our branch to NULL. Note that we can + * only be attached to the root by its left branch. + */ + + if (eb_clrtag(parent->branches.b[EB_RGHT]) == NULL) { + /* we're just below the root, it's trivial. */ + parent->branches.b[EB_LEFT] = NULL; + goto delete_unlink; + } + + /* To release our parent, we have to identify our sibling, and reparent + * it directly to/from the grand parent. Note that the sibling can + * either be a link or a leaf. + */ + + gpside = eb_gettag(parent->node_p); + gparent = eb_untag(parent->node_p, gpside); + + gparent->b[gpside] = parent->branches.b[!pside]; + sibtype = eb_gettag(gparent->b[gpside]); + + if (sibtype == EB_LEAF) { + eb_root_to_node(eb_untag(gparent->b[gpside], EB_LEAF))->leaf_p = + eb_dotag(gparent, gpside); + } else { + eb_root_to_node(eb_untag(gparent->b[gpside], EB_NODE))->node_p = + eb_dotag(gparent, gpside); + } + /* Mark the parent unused. Note that we do not check if the parent is + * our own node, but that's not a problem because if it is, it will be + * marked unused at the same time, which we'll use below to know we can + * safely remove it. + */ + parent->node_p = NULL; + + /* The parent node has been detached, and is currently unused. It may + * belong to another node, so we cannot remove it that way. Also, our + * own node part might still be used. so we can use this spare node + * to replace ours if needed. + */ + + /* If our link part is unused, we can safely exit now */ + if (!node->node_p) + goto delete_unlink; + + /* From now on, and are necessarily different, and the + * 's node part is in use. By definition, is at least + * below , so keeping its key for the bit string is OK. + */ + + parent->node_p = node->node_p; + parent->branches = node->branches; + parent->bit = node->bit; + + /* We must now update the new node's parent... */ + gpside = eb_gettag(parent->node_p); + gparent = eb_untag(parent->node_p, gpside); + gparent->b[gpside] = eb_dotag(&parent->branches, EB_NODE); + + /* ... 
and its branches */ + for (pside = 0; pside <= 1; pside++) { + if (eb_gettag(parent->branches.b[pside]) == EB_NODE) { + eb_root_to_node(eb_untag(parent->branches.b[pside], EB_NODE))->node_p = + eb_dotag(&parent->branches, pside); + } else { + eb_root_to_node(eb_untag(parent->branches.b[pside], EB_LEAF))->leaf_p = + eb_dotag(&parent->branches, pside); + } + } + delete_unlink: + /* Now the node has been completely unlinked */ + node->leaf_p = NULL; + return; /* tree is not empty yet */ +} + +/* These functions are declared in ebtree.c */ +void eb_delete(struct eb_node *node); +struct eb_node *eb_insert_dup(struct eb_node *sub, struct eb_node *new); + +#endif /* __EBTREE_H__ */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/linux-5.10.35.patch b/linux-5.10.35.patch index 32140c15b..e05fec726 100644 --- a/linux-5.10.35.patch +++ b/linux-5.10.35.patch @@ -535,7 +535,7 @@ index e37480b5f..8236d5929 100644 /* diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h -index a828cf99c..ea837130d 100644 +index a828cf99c..5d416997a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -232,6 +232,12 @@ @@ -551,22 +551,20 @@ index a828cf99c..ea837130d 100644 /* return minimum truesize of one skb containing X bytes of data */ #define SKB_TRUESIZE(X) ((X) + \ -@@ -724,6 +730,14 @@ struct sk_buff { +@@ -724,6 +730,12 @@ struct sk_buff { * UDP receive path is one user. */ unsigned long dev_scratch; +#ifdef CONFIG_SECURITY_TEMPESTA -+ struct { -+ __u8 present : 1; -+ __u8 tls_type : 7; -+ __u16 flags : 16; -+ unsigned int cb; -+ } tfw_cb; ++ struct { ++ __u8 present : 1; ++ __u8 tls_type : 7; ++ } tfw_cb; +#endif }; }; struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ -@@ -784,9 +798,15 @@ struct sk_buff { +@@ -784,9 +796,15 @@ struct sk_buff { fclone:2, peeked:1, head_frag:1, @@ -578,11 +576,11 @@ index a828cf99c..ea837130d 100644 __u8 active_extensions; +#endif +#ifdef CONFIG_SECURITY_TEMPESTA -+ __u8 tail_lock:1; ++ __u8 tail_lock:1; #endif /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() -@@ -839,7 +859,6 @@ struct sk_buff { +@@ -839,7 +857,6 @@ struct sk_buff { #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif @@ -590,32 +588,11 @@ index a828cf99c..ea837130d 100644 __u8 ipvs_property:1; __u8 inner_protocol_type:1; __u8 remcsum_offload:1; -@@ -931,6 +950,96 @@ struct sk_buff { +@@ -931,6 +948,43 @@ struct sk_buff { #define SKB_ALLOC_RX 0x02 #define SKB_ALLOC_NAPI 0x04 +#ifdef CONFIG_SECURITY_TEMPESTA -+enum { -+ /* This skb contains start of http2 frame. */ -+ SS_F_HTTP2_FRAME_START = 0x01, -+ /* This skb contains new hpack dynamic table size. */ -+ SS_F_HTTT2_HPACK_TBL_SZ_ENCODED = 0x02, -+ /* This skb contains headers frame. */ -+ SS_F_HTTT2_FRAME_HEADERS = 0x04, -+ /* This skb contains data frame. */ -+ SS_F_HTTT2_FRAME_DATA = 0x08, -+ /* This skb was already prepared. */ -+ SS_F_HTTP2_FRAME_PREPARED = 0x10, -+ /* This skb acks new hpack dynamic tbl size. */ -+ SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING = 0x20, -+ /* -+ * These flags should be cleared when we copy flags -+ * from one skb to another one. 
-+ */ -+ TEMPESTA_SKB_FLAG_CLEAR_MASK = SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING | -+ SS_F_HTTT2_HPACK_TBL_SZ_ENCODED | -+ SS_F_HTTP2_FRAME_START, -+}; + +static inline unsigned long +skb_tfw_is_present(struct sk_buff *skb) @@ -626,9 +603,9 @@ index a828cf99c..ea837130d 100644 +static inline void +skb_set_tfw_tls_type(struct sk_buff *skb, unsigned char tls_type) +{ -+ BUG_ON(tls_type > 0x7F); -+ skb->tfw_cb.present = 1; -+ skb->tfw_cb.tls_type = tls_type; ++ BUG_ON(tls_type > 0x7F); ++ skb->tfw_cb.present = 1; ++ skb->tfw_cb.tls_type = tls_type; +} + +static inline unsigned char @@ -638,38 +615,6 @@ index a828cf99c..ea837130d 100644 +} + +static inline void -+skb_set_tfw_flags(struct sk_buff *skb, unsigned short flags) -+{ -+ skb->tfw_cb.present = 1; -+ skb->tfw_cb.flags |= flags; -+} -+ -+static inline void -+skb_clear_tfw_flag(struct sk_buff *skb, unsigned short flag) -+{ -+ skb->tfw_cb.flags &= ~flag; -+} -+ -+static inline unsigned short -+skb_tfw_flags(struct sk_buff *skb) -+{ -+ return skb->tfw_cb.present ? skb->tfw_cb.flags : 0; -+} -+ -+static inline void -+skb_set_tfw_cb(struct sk_buff *skb, unsigned int cb) -+{ -+ skb->tfw_cb.present = 1; -+ skb->tfw_cb.cb = cb; -+} -+ -+static inline unsigned int -+skb_tfw_cb(struct sk_buff *skb) -+{ -+ return skb->tfw_cb.present ? skb->tfw_cb.cb : 0; -+} -+ -+static inline void +skb_copy_tfw_cb(struct sk_buff *dst, struct sk_buff *src) +{ + dst->dev = src->dev; @@ -687,7 +632,7 @@ index a828cf99c..ea837130d 100644 /** * skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves * @skb: buffer -@@ -1074,6 +1183,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen); +@@ -1074,6 +1128,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize); @@ -695,7 +640,7 @@ index a828cf99c..ea837130d 100644 struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node); struct sk_buff *__build_skb(void *data, unsigned int frag_size); -@@ -2104,7 +2214,11 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); +@@ -2104,7 +2159,11 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); static inline bool skb_is_nonlinear(const struct sk_buff *skb) { @@ -707,7 +652,7 @@ index a828cf99c..ea837130d 100644 } static inline unsigned int skb_headlen(const struct sk_buff *skb) -@@ -2341,6 +2455,20 @@ static inline unsigned int skb_headroom(const struct sk_buff *skb) +@@ -2341,6 +2400,20 @@ static inline unsigned int skb_headroom(const struct sk_buff *skb) return skb->data - skb->head; } @@ -807,40 +752,53 @@ index 89163ef8c..49ad1ddc9 100644 union { struct ip_options_rcu __rcu *ireq_opt; diff --git a/include/net/sock.h b/include/net/sock.h -index 261195598..456f6bd50 100644 +index 261195598..6b7910c55 100644 --- a/include/net/sock.h +++ b/include/net/sock.h -@@ -506,6 +506,19 @@ struct sock { +@@ -506,6 +506,31 @@ struct sock { void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk); void (*sk_write_space)(struct sock *sk); +#ifdef CONFIG_SECURITY_TEMPESTA -+ int (*sk_prepare_xmit)(struct sock *sk, -+ struct sk_buff *skb, -+ unsigned int mss_now, -+ unsigned int *limit, -+ unsigned int *skbs); ++ /* ++ * Tempesta FW callback to ecrypt one ++ * or more skb in socket write queue ++ * before sending. 
++ */ + int (*sk_write_xmit)(struct sock *sk, + struct sk_buff *skb, + unsigned int mss_now, -+ unsigned int limit, -+ unsigned int skbs); ++ unsigned int limit); ++ /* ++ * Tempesta FW callback to prepare and push ++ * skbs from Tempesta FW private scheduler ++ * to socket write queue according sender ++ * and receiver window. ++ */ ++ int (*sk_fill_write_queue)(struct sock *sk, ++ unsigned int mss_now, ++ int ss_action); ++ /* ++ * Tempesta FW callback to free all private ++ * resources associated with socket. ++ */ + void (*sk_destroy_cb)(struct sock *sk); +#endif void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); -@@ -861,6 +874,9 @@ enum sock_flags { +@@ -861,6 +886,10 @@ enum sock_flags { SOCK_TXTIME, SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ +#ifdef CONFIG_SECURITY_TEMPESTA + SOCK_TEMPESTA, /* The socket is managed by Tempesta FW */ ++ SOCK_TEMPESTA_HAS_DATA /* The socket has data in Tempesta FW write queue */ +#endif }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) -@@ -1081,6 +1097,16 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) +@@ -1081,6 +1110,16 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) __rc; \ }) @@ -857,7 +815,7 @@ index 261195598..456f6bd50 100644 int sk_stream_wait_connect(struct sock *sk, long *timeo_p); int sk_stream_wait_memory(struct sock *sk, long *timeo_p); void sk_stream_wait_close(struct sock *sk, long timeo_p); -@@ -1915,8 +1941,7 @@ static inline bool sk_rethink_txhash(struct sock *sk) +@@ -1915,8 +1954,7 @@ static inline bool sk_rethink_txhash(struct sock *sk) static inline struct dst_entry * __sk_dst_get(struct sock *sk) { @@ -868,7 +826,7 @@ index 261195598..456f6bd50 100644 static inline struct dst_entry * diff --git a/include/net/tcp.h b/include/net/tcp.h -index 7d66c61d2..8ec3cbbfb 100644 +index 7d66c61d2..7785fc8a6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -307,6 +307,7 @@ bool tcp_check_oom(struct sock *sk, int shift); @@ -879,16 +837,7 @@ index 7d66c61d2..8ec3cbbfb 100644 #define TCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define __TCP_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.tcp_statistics, field) -@@ -584,6 +585,8 @@ enum tcp_queue { - TCP_FRAG_IN_WRITE_QUEUE, - TCP_FRAG_IN_RTX_QUEUE, - }; -+int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, -+ unsigned int mss_now, gfp_t gfp); - int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, - struct sk_buff *skb, u32 len, - unsigned int mss_now, gfp_t gfp); -@@ -653,6 +656,22 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) +@@ -653,6 +654,22 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) /* tcp.c */ void tcp_get_info(struct sock *, struct tcp_info *); @@ -911,6 +860,58 @@ index 7d66c61d2..8ec3cbbfb 100644 /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); +@@ -1858,11 +1875,51 @@ static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct soc + sk_wmem_free_skb(sk, skb); + } + ++#ifdef CONFIG_SECURITY_TEMPESTA ++/** ++ * This function is similar to `tcp_write_err` except that we send ++ * TCP RST to remote peer. We call this function when an error occurs ++ * while sending data from which we cannot recover, so we close the ++ * connection with TCP RST. 
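For context, a hedged sketch of how a module could wire up the three callbacks added to struct sock above when it takes over an established connection. The tfw_sk_* functions and tfw_attach_sock() are hypothetical names; only the sk_write_xmit, sk_fill_write_queue and sk_destroy_cb fields and the SOCK_TEMPESTA flag come from the patch.

/* Hypothetical callbacks, implemented elsewhere in the module; the
 * signatures follow the fields added to struct sock by this patch.
 */
int  tfw_sk_write_xmit(struct sock *sk, struct sk_buff *skb,
		       unsigned int mss_now, unsigned int limit);
int  tfw_sk_fill_write_queue(struct sock *sk, unsigned int mss_now,
			     int ss_action);
void tfw_sk_destroy(struct sock *sk);

static void tfw_attach_sock(struct sock *sk)
{
	bh_lock_sock(sk);
	sk->sk_write_xmit = tfw_sk_write_xmit;		/* encrypt skbs on xmit */
	sk->sk_fill_write_queue = tfw_sk_fill_write_queue;
	sk->sk_destroy_cb = tfw_sk_destroy;		/* per-socket cleanup */
	sock_set_flag(sk, SOCK_TEMPESTA);		/* managed by Tempesta FW */
	bh_unlock_sock(sk);
}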
++ */ ++static inline void ++tcp_tfw_handle_error(struct sock *sk, int error) ++{ ++ tcp_send_active_reset(sk, GFP_ATOMIC); ++ sk->sk_err = error; ++ sk->sk_error_report(sk); ++ tcp_write_queue_purge(sk); ++ tcp_done(sk); ++} ++#endif ++ + static inline void tcp_push_pending_frames(struct sock *sk) + { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ unsigned int mss_now = 0; ++ ++ if (sock_flag(sk, SOCK_TEMPESTA_HAS_DATA) ++ && sk->sk_fill_write_queue) ++ { ++ int result; ++ ++ mss_now = tcp_current_mss(sk); ++ result = sk->sk_fill_write_queue(sk, mss_now, 0); ++ if (unlikely(result < 0 && result != -ENOMEM)) { ++ tcp_tfw_handle_error(sk, result); ++ return; ++ } ++ } ++#endif + if (tcp_send_head(sk)) { + struct tcp_sock *tp = tcp_sk(sk); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (mss_now != 0) { ++ int nonagle = TCP_NAGLE_OFF | TCP_NAGLE_PUSH; ++ __tcp_push_pending_frames(sk, mss_now, nonagle); ++ } else ++#endif + __tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle); + } + } diff --git a/include/net/tls.h b/include/net/tls.h index 2bdd80221..356850dda 100644 --- a/include/net/tls.h @@ -1408,7 +1409,7 @@ index f35c2e998..6ec40ac3c 100644 } +EXPORT_SYMBOL(reqsk_fastopen_remove); diff --git a/net/core/skbuff.c b/net/core/skbuff.c -index 1301ea694..02ff44569 100644 +index 1301ea694..42fc8a110 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -80,7 +80,9 @@ @@ -1911,7 +1912,7 @@ index 1301ea694..02ff44569 100644 { +#ifdef CONFIG_SECURITY_TEMPESTA + int cpu, l; -+ for_each_possible_cpu(cpu) ++ for_each_online_cpu(cpu) + for (l = 0; l < PG_LISTS_N; ++l) { + TfwSkbMemPool *pool = per_cpu_ptr(&pg_mpool[l], cpu); + INIT_LIST_HEAD(&pool->lh); @@ -2422,7 +2423,7 @@ index f0f67b25c..58fbfb071 100644 return NULL; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index f99494637..6364d7c5f 100644 +index f99494637..14d28bcca 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -39,6 +39,9 @@ @@ -2435,7 +2436,17 @@ index f99494637..6364d7c5f 100644 #include #include -@@ -389,7 +392,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, +@@ -155,6 +158,9 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta) + tp->snd_cwnd_stamp = tcp_jiffies32; + tp->snd_cwnd_used = 0; + } ++#ifdef CONFIG_SECURITY_TEMPESTA ++EXPORT_SYMBOL(tcp_cwnd_restart); ++#endif + + /* Congestion state accounting after a packet has been sent. */ + static void tcp_event_data_sent(struct tcp_sock *tp, +@@ -389,7 +395,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ @@ -2444,7 +2455,7 @@ index f99494637..6364d7c5f 100644 { skb->ip_summed = CHECKSUM_PARTIAL; -@@ -403,6 +406,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) +@@ -403,6 +409,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) seq++; TCP_SKB_CB(skb)->end_seq = seq; } @@ -2452,7 +2463,7 @@ index f99494637..6364d7c5f 100644 static inline bool tcp_urg_mode(const struct tcp_sock *tp) { -@@ -1428,7 +1432,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, +@@ -1428,7 +1435,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, * otherwise socket can stall. 
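To make the intended data flow concrete, a hedged sketch of the producer side of the SOCK_TEMPESTA_HAS_DATA contract used by the modified tcp_push_pending_frames() above. tfw_private_queue_tail() is a hypothetical helper for a module-private send queue; the rest uses only symbols from the patch and the stock TCP API.

/* The caller is assumed to hold the socket lock. */
static void tfw_queue_and_push(struct sock *sk, struct sk_buff *skb)
{
	/* Park the skb in a module-private queue, not in sk_write_queue. */
	tfw_private_queue_tail(sk, skb);

	/* Tell the stack there is pending Tempesta data for this socket. */
	sock_set_flag(sk, SOCK_TEMPESTA_HAS_DATA);

	/*
	 * tcp_push_pending_frames() now invokes sk->sk_fill_write_queue(),
	 * which moves as much data as the send and congestion windows
	 * allow into the TCP write queue; the frames are then pushed out
	 * with Nagle disabled. A fatal callback error (anything other
	 * than -ENOMEM) resets the connection via tcp_tfw_handle_error().
	 */
	tcp_push_pending_frames(sk);
}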
*/ @@ -2461,7 +2472,7 @@ index f99494637..6364d7c5f 100644 { struct tcp_sock *tp = tcp_sk(sk); -@@ -1439,9 +1443,10 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) +@@ -1439,9 +1446,10 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); } @@ -2473,7 +2484,7 @@ index f99494637..6364d7c5f 100644 { if (skb->len <= mss_now) { /* Avoid the costly divide in the normal -@@ -1454,11 +1459,12 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) +@@ -1454,11 +1462,12 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) TCP_SKB_CB(skb)->tcp_gso_size = mss_now; } } @@ -2487,7 +2498,7 @@ index f99494637..6364d7c5f 100644 { struct tcp_sock *tp = tcp_sk(sk); -@@ -1482,6 +1488,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de +@@ -1482,6 +1491,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de tcp_verify_left_out(tp); } @@ -2495,7 +2506,7 @@ index f99494637..6364d7c5f 100644 static bool tcp_has_tx_tstamp(const struct sk_buff *skb) { -@@ -1489,7 +1496,7 @@ static bool tcp_has_tx_tstamp(const struct sk_buff *skb) +@@ -1489,7 +1499,7 @@ static bool tcp_has_tx_tstamp(const struct sk_buff *skb) (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP); } @@ -2504,7 +2515,7 @@ index f99494637..6364d7c5f 100644 { struct skb_shared_info *shinfo = skb_shinfo(skb); -@@ -1505,12 +1512,14 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) +@@ -1505,12 +1515,14 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) TCP_SKB_CB(skb)->txstamp_ack = 0; } } @@ -2520,7 +2531,7 @@ index f99494637..6364d7c5f 100644 /* Insert buff after skb on the write or rtx queue of sk. */ static void tcp_insert_write_queue_after(struct sk_buff *skb, -@@ -1518,12 +1527,39 @@ static void tcp_insert_write_queue_after(struct sk_buff *skb, +@@ -1518,12 +1530,39 @@ static void tcp_insert_write_queue_after(struct sk_buff *skb, struct sock *sk, enum tcp_queue tcp_queue) { @@ -2560,7 +2571,7 @@ index f99494637..6364d7c5f 100644 /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. -@@ -1561,7 +1597,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, +@@ -1561,7 +1600,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; } @@ -2569,7 +2580,7 @@ index f99494637..6364d7c5f 100644 return -ENOMEM; /* Get a new skb... force flag on. */ -@@ -1575,6 +1611,9 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, +@@ -1575,6 +1614,9 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, nlen = skb->len - len - nsize; buff->truesize += nlen; skb->truesize -= nlen; @@ -2579,7 +2590,7 @@ index f99494637..6364d7c5f 100644 /* Correct the sequence numbers. 
*/ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; -@@ -1670,7 +1709,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) +@@ -1670,7 +1712,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { u32 delta_truesize; @@ -2588,7 +2599,7 @@ index f99494637..6364d7c5f 100644 return -ENOMEM; delta_truesize = __pskb_trim_head(skb, len); -@@ -1848,6 +1887,7 @@ unsigned int tcp_current_mss(struct sock *sk) +@@ -1848,6 +1890,7 @@ unsigned int tcp_current_mss(struct sock *sk) return mss_now; } @@ -2596,18 +2607,7 @@ index f99494637..6364d7c5f 100644 /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. * As additional protections, we do not touch cwnd in retransmission phases, -@@ -2108,8 +2148,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, - * know that all the data is in scatter-gather pages, and that the - * packet has never been sent out before (and thus is not cloned). - */ --static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, -- unsigned int mss_now, gfp_t gfp) -+int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, -+ unsigned int mss_now, gfp_t gfp) - { - int nlen = skb->len - len; - struct sk_buff *buff; -@@ -2129,6 +2169,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, +@@ -2129,6 +2172,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; @@ -2617,15 +2617,7 @@ index f99494637..6364d7c5f 100644 /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; -@@ -2159,6 +2202,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, - - return 0; - } -+EXPORT_SYMBOL(tso_fragment); - - /* Try to defer sending, if possible, in order to minimize the amount - * of TSO splitting we do. View it as a kind of TSO Nagle test. -@@ -2303,6 +2347,14 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) +@@ -2303,6 +2349,14 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb)) return false; @@ -2640,57 +2632,46 @@ index f99494637..6364d7c5f 100644 len -= skb->len; } -@@ -2577,6 +2629,78 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) +@@ -2577,6 +2631,66 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); } +#ifdef CONFIG_SECURITY_TEMPESTA + +/** -+ * The next two functions are called from places: from `tcp_write_xmit` ++ * The next funtion is called from places: from `tcp_write_xmit` + * (a usual case) and from `tcp_write_wakeup`. In other places where + * `tcp_transmit_skb` is called we deal with special TCP skbs or skbs + * not from tcp send queue. 
+ */ +static int -+tcp_tfw_sk_prepare_xmit(struct sock *sk, struct sk_buff *skb, -+ unsigned int mss_now, unsigned int *limit, -+ unsigned int *nskbs) -+{ -+ if (!sk->sk_prepare_xmit || !skb_tfw_tls_type(skb)) -+ return 0; -+ -+ if (unlikely(*limit <= TLS_MAX_OVERHEAD)) { -+ net_warn_ratelimited("%s: too small MSS %u" -+ " for TLS\n", -+ __func__, mss_now); -+ return -ENOMEM; -+ } -+ -+ if (*limit > TLS_MAX_PAYLOAD_SIZE + TLS_MAX_OVERHEAD) -+ *limit = TLS_MAX_PAYLOAD_SIZE; -+ else -+ *limit -= TLS_MAX_OVERHEAD; -+ -+ if (unlikely(skb_tfw_flags(skb) & SS_F_HTTP2_FRAME_PREPARED)) { -+ *nskbs = 1; -+ return 0; -+ } -+ -+ return sk->sk_prepare_xmit(sk, skb, mss_now, limit, nskbs); -+} -+ -+static int +tcp_tfw_sk_write_xmit(struct sock *sk, struct sk_buff *skb, -+ unsigned int mss_now, unsigned int limit, -+ unsigned int nskbs) ++ unsigned int mss_now) +{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ unsigned int in_flight = tcp_packets_in_flight(tp); ++ unsigned int send_win, cong_win; ++ unsigned int limit; + int result; + + if (!sk->sk_write_xmit || !skb_tfw_tls_type(skb)) + return 0; + -+ result = sk->sk_write_xmit(sk, skb, mss_now, limit, nskbs); ++ /* Should be checked early. */ ++ BUG_ON(after(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))); ++ cong_win = (tp->snd_cwnd - in_flight) * mss_now; ++ send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; ++ /* ++ * A receive side doesn’t start to process a TLS recod until ++ * it’s fully read from a socket. Too small record size causes ++ * too much overhead. On the other side too large record size ++ * can lead to significant delays on receive side if current ++ * TCP congestion and/or the receiver’s advertised window are ++ * smaller than a TLS record size. ++ */ ++ limit = min3(cong_win, send_win, (unsigned int)TLS_MAX_PAYLOAD_SIZE); ++ ++ result = sk->sk_write_xmit(sk, skb, mss_now, limit); + if (unlikely(result)) + return result; + @@ -2699,61 +2680,51 @@ index f99494637..6364d7c5f 100644 + return 0; +} + -+/** -+ * This function is similar to `tcp_write_err` except that we send -+ * TCP RST to remote peer. We call this function when an error occurs -+ * while sending data from which we cannot recover, so we close the -+ * connection with TCP RST. ++/* ++ * We should recalculate max_size, and split skb according ++ * new limit, because we add extra TLS_MAX_OVERHEAD bytes ++ * during tls encription. If we don't adjust it, we push ++ * skb with incorrect length to network. + */ -+static void -+tcp_tfw_handle_error(struct sock *sk, int error) -+{ -+ tcp_send_active_reset(sk, GFP_ATOMIC); -+ sk->sk_err = error; -+ sk->sk_error_report(sk); -+ tcp_write_queue_purge(sk); -+ tcp_done(sk); -+} ++#define TFW_ADJUST_TLS_OVERHEAD(max_size) \ ++do { \ ++ if (max_size > TLS_MAX_PAYLOAD_SIZE + TLS_MAX_OVERHEAD) \ ++ max_size = TLS_MAX_PAYLOAD_SIZE; \ ++ else \ ++ max_size -= TLS_MAX_OVERHEAD; \ ++} while(0) ++ +#endif + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. 
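To make the record-size clamping above concrete, a worked example with illustrative numbers, assuming TLS_MAX_PAYLOAD_SIZE is the usual 16 KB TLS plaintext limit and TLS_MAX_OVERHEAD is the per-record expansion defined in the TLS layer:

	mss_now  = 1460, tp->snd_cwnd = 20, in_flight = 4
	cong_win = (20 - 4) * 1460                        = 23360
	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq = 32768 (say)
	limit    = min3(23360, 32768, 16384)              = 16384

so one TLS record carries at most 16 KB of plaintext and still fits both the congestion and the receive window. TFW_ADJUST_TLS_OVERHEAD() handles the opposite case in the TSO path: when the computed limit is small, say 4096 bytes, it subtracts TLS_MAX_OVERHEAD so that the plaintext plus the record expansion still fits the original budget; only when the limit already exceeds TLS_MAX_PAYLOAD_SIZE + TLS_MAX_OVERHEAD is it clamped to the full 16 KB payload.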
-@@ -2601,6 +2725,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, - int result; - bool is_cwnd_limited = false, is_rwnd_limited = false; - u32 max_segs; -+#ifdef CONFIG_SECURITY_TEMPESTA -+ unsigned int nskbs = UINT_MAX; -+#endif - - sent_pkts = 0; - -@@ -2666,7 +2793,16 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2666,7 +2780,17 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota, max_segs), nonagle); - +#ifdef CONFIG_SECURITY_TEMPESTA -+ result = tcp_tfw_sk_prepare_xmit(sk, skb, mss_now, &limit, -+ &nskbs); -+ if (unlikely(result)) { -+ if (result == -ENOMEM) -+ break; /* try again next time */ -+ tcp_tfw_handle_error(sk, result); -+ return false; ++ if (sk->sk_write_xmit && skb_tfw_tls_type(skb)) { ++ if (unlikely(limit <= TLS_MAX_OVERHEAD)) { ++ net_warn_ratelimited("%s: too small MSS %u" ++ " for TLS\n", ++ __func__, mss_now); ++ break; ++ } ++ TFW_ADJUST_TLS_OVERHEAD(limit); + } +#endif if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; -@@ -2681,7 +2817,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2681,7 +2805,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, */ if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) break; - +#ifdef CONFIG_SECURITY_TEMPESTA -+ result = tcp_tfw_sk_write_xmit(sk, skb, mss_now, limit, nskbs); ++ result = tcp_tfw_sk_write_xmit(sk, skb, mss_now); + if (unlikely(result)) { + if (result == -ENOMEM) + break; /* try again next time */ @@ -2764,7 +2735,7 @@ index f99494637..6364d7c5f 100644 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; -@@ -2866,6 +3010,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, +@@ -2866,6 +2998,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } @@ -2772,7 +2743,7 @@ index f99494637..6364d7c5f 100644 /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. -@@ -3183,7 +3328,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) +@@ -3183,7 +3316,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { @@ -2781,7 +2752,7 @@ index f99494637..6364d7c5f 100644 return -ENOMEM; diff = tcp_skb_pcount(skb); -@@ -3421,6 +3566,7 @@ void tcp_send_fin(struct sock *sk) +@@ -3421,6 +3554,7 @@ void tcp_send_fin(struct sock *sk) } __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); } @@ -2789,7 +2760,7 @@ index f99494637..6364d7c5f 100644 /* We get here when a process closes a file descriptor (either due to * an explicit close() or as a byproduct of exit()'ing) and there -@@ -3454,6 +3600,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) +@@ -3454,6 +3588,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) */ trace_tcp_send_reset(sk, NULL); } @@ -2797,39 +2768,31 @@ index f99494637..6364d7c5f 100644 /* Send a crossed SYN-ACK during socket establishment. 
* WARNING: This routine must only be called when we have already sent -@@ -4030,6 +4177,9 @@ int tcp_write_wakeup(struct sock *sk, int mib) - - skb = tcp_send_head(sk); - if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { -+#ifdef CONFIG_SECURITY_TEMPESTA -+ unsigned int nskbs = UINT_MAX; -+#endif - int err; - unsigned int mss = tcp_current_mss(sk); - unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; -@@ -4037,6 +4187,15 @@ int tcp_write_wakeup(struct sock *sk, int mib) - if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) - tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; - -+#ifdef CONFIG_SECURITY_TEMPESTA -+ err = tcp_tfw_sk_prepare_xmit(sk, skb, mss, &seg_size, &nskbs); -+ if (unlikely(err)) { -+ if (err != -ENOMEM) -+ tcp_tfw_handle_error(sk, err); -+ return err; -+ } +@@ -4044,6 +4179,17 @@ int tcp_write_wakeup(struct sock *sk, int mib) + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || + skb->len > mss) { + seg_size = min(seg_size, mss); ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (sk->sk_write_xmit && skb_tfw_tls_type(skb)) { ++ if (unlikely(seg_size <= TLS_MAX_OVERHEAD)) { ++ net_warn_ratelimited("%s: too small" ++ " MSS %u for TLS\n", ++ __func__, mss); ++ return -ENOMEM; ++ } ++ TFW_ADJUST_TLS_OVERHEAD(seg_size); ++ } +#endif -+ - /* We are probing the opening of a window - * but the window size is != 0 - * must have been a result SWS avoidance ( sender ) -@@ -4052,6 +4211,16 @@ int tcp_write_wakeup(struct sock *sk, int mib) + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; + if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, seg_size, mss, GFP_ATOMIC)) +@@ -4052,6 +4198,16 @@ int tcp_write_wakeup(struct sock *sk, int mib) tcp_set_skb_tso_segs(skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; + +#ifdef CONFIG_SECURITY_TEMPESTA -+ err = tcp_tfw_sk_write_xmit(sk, skb, mss, seg_size, nskbs); ++ err = tcp_tfw_sk_write_xmit(sk, skb, mss); + if (unlikely(err)) { + if (err != -ENOMEM) + tcp_tfw_handle_error(sk, err); diff --git a/tls/mpool.c b/tls/mpool.c index 0516d058e..c05387b3e 100644 --- a/tls/mpool.c +++ b/tls/mpool.c @@ -19,7 +19,7 @@ * implicitly for MPI math. Dynamically allocated pages are used instead of * static per-cpu ones. * - * Copyright (C) 2019-2022 Tempesta Technologies, Inc. + * Copyright (C) 2019-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -397,7 +397,7 @@ ttls_mpool_exit(void) int i; TlsMpiPool *mp; - for_each_possible_cpu(i) { + for_each_online_cpu(i) { mp = per_cpu(g_tmp_mpool, i); ttls_bzero_safe(MPI_POOL_DATA(mp), mp->curr - sizeof(*mp)); free_pages((unsigned long)mp, mp->order); @@ -409,7 +409,7 @@ ttls_mpool_init(void) { int cpu; - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { TlsMpiPool **mp = per_cpu_ptr(&g_tmp_mpool, cpu); if (!(*mp = ttls_mpi_pool_create(__MPOOL_STACK_ORDER, GFP_KERNEL))) diff --git a/tls/rsa.c b/tls/rsa.c index 121d1d576..1cea5539d 100644 --- a/tls/rsa.c +++ b/tls/rsa.c @@ -24,7 +24,7 @@ * Based on mbed TLS, https://tls.mbed.org. * * Copyright (C) 2006-2015, ARM Limited, All Rights Reserved - * Copyright (C) 2015-2021 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -133,7 +133,7 @@ __rsa_setup_ctx(TlsRSACtx *ctx) * Generate blinding values. 
* Unblinding value: Vf = random number, invertible mod N. */ - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { int count = 0; TlsMpi *vi = per_cpu_ptr(ctx->Vi, cpu); diff --git a/tls/ttls.c b/tls/ttls.c index 9ccdd8744..0cce8f1e1 100644 --- a/tls/ttls.c +++ b/tls/ttls.c @@ -8,7 +8,7 @@ * Based on mbed TLS, https://tls.mbed.org. * * Copyright (C) 2006-2015, ARM Limited, All Rights Reserved - * Copyright (C) 2015-2023 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -2824,7 +2824,7 @@ ttls_exit(void) kmem_cache_destroy(ttls_hs_cache); - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { struct aead_request **req = per_cpu_ptr(&g_req, cpu); kfree(*req); } @@ -2852,7 +2852,7 @@ ttls_init(void) if ((r = ttls_tickets_init())) goto err_free; - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { struct aead_request **req = per_cpu_ptr(&g_req, cpu); *req = kmalloc(ttls_aead_reqsize(), GFP_KERNEL); if (!*req) From f8de856a855ea80aed656e192f482bf213cd868a Mon Sep 17 00:00:00 2001 From: kingluo Date: Thu, 4 Jul 2024 21:05:14 +0800 Subject: [PATCH 19/25] update linux-6.8.9.patch --- linux-6.8.9.patch | 432 +++++++++++++++++++++++++++++----------------- 1 file changed, 278 insertions(+), 154 deletions(-) diff --git a/linux-6.8.9.patch b/linux-6.8.9.patch index a339f9bc1..6966d74de 100644 --- a/linux-6.8.9.patch +++ b/linux-6.8.9.patch @@ -549,7 +549,7 @@ index dba428b3a..20f90054f 100644 /* diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h -index 5bafcfe18..b05a07d99 100644 +index 5bafcfe18..becadd84a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -266,6 +266,12 @@ @@ -565,7 +565,7 @@ index 5bafcfe18..b05a07d99 100644 /* return minimum truesize of one skb containing X bytes of data */ #define SKB_TRUESIZE(X) ((X) + \ -@@ -861,6 +867,14 @@ struct sk_buff { +@@ -861,6 +867,13 @@ struct sk_buff { * UDP receive path is one user. 
*/ unsigned long dev_scratch; @@ -573,14 +573,13 @@ index 5bafcfe18..b05a07d99 100644 + struct { + __u8 present : 1; + __u8 tls_type : 7; -+ __u16 flags : 16; + unsigned int cb; + } tfw_cb; +#endif }; }; struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ -@@ -922,11 +936,17 @@ struct sk_buff { +@@ -922,11 +935,17 @@ struct sk_buff { fclone:2, peeked:1, head_frag:1, @@ -593,12 +592,12 @@ index 5bafcfe18..b05a07d99 100644 __u8 active_extensions; #endif +#ifdef CONFIG_SECURITY_TEMPESTA -+ __u8 tail_lock:1; ++ __u8 tail_lock:1; +#endif /* Fields enclosed in headers group are copied * using a single memcpy() in __copy_skb_header() -@@ -1096,6 +1116,98 @@ struct sk_buff { +@@ -1096,6 +1115,79 @@ struct sk_buff { #define SKB_ALLOC_RX 0x02 #define SKB_ALLOC_NAPI 0x04 @@ -636,9 +635,9 @@ index 5bafcfe18..b05a07d99 100644 +static inline void +skb_set_tfw_tls_type(struct sk_buff *skb, unsigned char tls_type) +{ -+ BUG_ON(tls_type > 0x7F); -+ skb->tfw_cb.present = 1; -+ skb->tfw_cb.tls_type = tls_type; ++ BUG_ON(tls_type > 0x7F); ++ skb->tfw_cb.present = 1; ++ skb->tfw_cb.tls_type = tls_type; +} + +static inline unsigned char @@ -648,25 +647,6 @@ index 5bafcfe18..b05a07d99 100644 +} + +static inline void -+skb_set_tfw_flags(struct sk_buff *skb, unsigned short flags) -+{ -+ skb->tfw_cb.present = 1; -+ skb->tfw_cb.flags |= flags; -+} -+ -+static inline void -+skb_clear_tfw_flag(struct sk_buff *skb, unsigned short flag) -+{ -+ skb->tfw_cb.flags &= ~flag; -+} -+ -+static inline unsigned short -+skb_tfw_flags(struct sk_buff *skb) -+{ -+ return skb->tfw_cb.present ? skb->tfw_cb.flags : 0; -+} -+ -+static inline void +skb_set_tfw_cb(struct sk_buff *skb, unsigned int cb) +{ + skb->tfw_cb.present = 1; @@ -697,7 +677,7 @@ index 5bafcfe18..b05a07d99 100644 /** * skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves * @skb: buffer -@@ -1267,6 +1379,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen); +@@ -1267,6 +1359,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize); @@ -705,7 +685,7 @@ index 5bafcfe18..b05a07d99 100644 struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node); struct sk_buff *__build_skb(void *data, unsigned int frag_size); -@@ -2402,7 +2515,11 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); +@@ -2402,7 +2495,11 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); static inline bool skb_is_nonlinear(const struct sk_buff *skb) { @@ -717,7 +697,7 @@ index 5bafcfe18..b05a07d99 100644 } static inline unsigned int skb_headlen(const struct sk_buff *skb) -@@ -2714,6 +2831,20 @@ static inline unsigned int skb_headroom(const struct sk_buff *skb) +@@ -2714,6 +2811,20 @@ static inline unsigned int skb_headroom(const struct sk_buff *skb) return skb->data - skb->head; } @@ -817,40 +797,53 @@ index d94c242eb..90b5f794c 100644 union { struct ip_options_rcu __rcu *ireq_opt; diff --git a/include/net/sock.h b/include/net/sock.h -index 54a796761..8c679819d 100644 +index 54a796761..cb0e3f851 100644 --- a/include/net/sock.h +++ b/include/net/sock.h -@@ -513,6 +513,19 @@ struct sock { +@@ -513,6 +513,31 @@ struct sock { void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk); void (*sk_write_space)(struct sock *sk); +#ifdef CONFIG_SECURITY_TEMPESTA -+ int (*sk_prepare_xmit)(struct sock *sk, -+ struct sk_buff *skb, -+ unsigned int mss_now, -+ unsigned int *limit, -+ 
unsigned int *skbs); ++ /* ++ * Tempesta FW callback to ecrypt one ++ * or more skb in socket write queue ++ * before sending. ++ */ + int (*sk_write_xmit)(struct sock *sk, + struct sk_buff *skb, + unsigned int mss_now, -+ unsigned int limit, -+ unsigned int skbs); ++ unsigned int limit); ++ /* ++ * Tempesta FW callback to prepare and push ++ * skbs from Tempesta FW private scheduler ++ * to socket write queue according sender ++ * and receiver window. ++ */ ++ int (*sk_fill_write_queue)(struct sock *sk, ++ unsigned int mss_now, ++ int ss_action); ++ /* ++ * Tempesta FW callback to free all private ++ * resources associated with socket. ++ */ + void (*sk_destroy_cb)(struct sock *sk); +#endif void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); -@@ -930,6 +943,9 @@ enum sock_flags { +@@ -930,6 +955,10 @@ enum sock_flags { SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ +#ifdef CONFIG_SECURITY_TEMPESTA + SOCK_TEMPESTA, /* The socket is managed by Tempesta FW */ ++ SOCK_TEMPESTA_HAS_DATA /* The socket has data in Tempesta FW write queue */ +#endif }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) -@@ -1174,6 +1190,16 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) +@@ -1174,6 +1203,16 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) __rc; \ }) @@ -867,7 +860,7 @@ index 54a796761..8c679819d 100644 int sk_stream_wait_connect(struct sock *sk, long *timeo_p); int sk_stream_wait_memory(struct sock *sk, long *timeo_p); void sk_stream_wait_close(struct sock *sk, long timeo_p); -@@ -2136,8 +2162,7 @@ static inline bool sk_rethink_txhash(struct sock *sk) +@@ -2136,8 +2175,7 @@ static inline bool sk_rethink_txhash(struct sock *sk) static inline struct dst_entry * __sk_dst_get(const struct sock *sk) { @@ -878,7 +871,7 @@ index 54a796761..8c679819d 100644 static inline struct dst_entry * diff --git a/include/net/tcp.h b/include/net/tcp.h -index f6eba9652..4c9e00994 100644 +index f6eba9652..0c630838b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -318,6 +318,7 @@ bool tcp_check_oom(struct sock *sk, int shift); @@ -889,7 +882,16 @@ index f6eba9652..4c9e00994 100644 #define TCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define __TCP_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.tcp_statistics, field) -@@ -615,6 +616,8 @@ enum tcp_queue { +@@ -359,6 +360,8 @@ ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, + unsigned int flags); + struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, + bool force_schedule); ++struct sk_buff *tcp_stream_alloc_skb_size(struct sock *sk, int size, gfp_t gfp, ++ bool force_schedule); + + static inline void tcp_dec_quickack_mode(struct sock *sk) + { +@@ -615,6 +618,8 @@ enum tcp_queue { TCP_FRAG_IN_WRITE_QUEUE, TCP_FRAG_IN_RTX_QUEUE, }; @@ -898,7 +900,7 @@ index f6eba9652..4c9e00994 100644 int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp); -@@ -684,6 +687,22 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) +@@ -684,6 +689,21 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) /* tcp.c */ void tcp_get_info(struct sock *, struct tcp_info *); @@ -916,11 +918,62 @@ index f6eba9652..4c9e00994 100644 +extern void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2); 
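
The socket callbacks and SOCK_TEMPESTA flags introduced above are only declared by these header hunks. A minimal sketch of how a module managed by Tempesta FW might install them on a socket it owns; the my_* functions are hypothetical placeholders and not part of the patch:

#include <net/sock.h>

/* Hypothetical prototypes matching the new struct sock fields. */
static int my_write_xmit(struct sock *sk, struct sk_buff *skb,
			 unsigned int mss_now, unsigned int limit);
static int my_fill_write_queue(struct sock *sk, unsigned int mss_now,
			       int ss_action);
static void my_destroy_cb(struct sock *sk);

/* Sketch only: wire up the Tempesta FW hooks on an accepted socket. */
static void my_install_tempesta_hooks(struct sock *sk)
{
	write_lock_bh(&sk->sk_callback_lock);
	sock_set_flag(sk, SOCK_TEMPESTA);
	/* Encrypt skbs in the TCP write queue right before transmission. */
	sk->sk_write_xmit = my_write_xmit;
	/* Push skbs from the module's private queue into the TCP write
	 * queue according to the send and congestion windows. */
	sk->sk_fill_write_queue = my_fill_write_queue;
	/* Free per-socket module state when the socket is destroyed. */
	sk->sk_destroy_cb = my_destroy_cb;
	write_unlock_bh(&sk->sk_callback_lock);
}
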
+extern void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2); +extern int tcp_close_state(struct sock *sk); -+extern void skb_entail(struct sock *sk, struct sk_buff *skb); + /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); +@@ -2053,11 +2073,51 @@ static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct soc + tcp_wmem_free_skb(sk, skb); + } + ++#ifdef CONFIG_SECURITY_TEMPESTA ++/** ++ * This function is similar to `tcp_write_err` except that we send ++ * TCP RST to remote peer. We call this function when an error occurs ++ * while sending data from which we cannot recover, so we close the ++ * connection with TCP RST. ++ */ ++static inline void ++tcp_tfw_handle_error(struct sock *sk, int error) ++{ ++ tcp_send_active_reset(sk, GFP_ATOMIC); ++ sk->sk_err = error; ++ sk->sk_error_report(sk); ++ tcp_write_queue_purge(sk); ++ tcp_done(sk); ++} ++#endif ++ + static inline void tcp_push_pending_frames(struct sock *sk) + { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ unsigned int mss_now = 0; ++ ++ if (sock_flag(sk, SOCK_TEMPESTA_HAS_DATA) ++ && sk->sk_fill_write_queue) ++ { ++ int result; ++ ++ mss_now = tcp_current_mss(sk); ++ result = sk->sk_fill_write_queue(sk, mss_now, 0); ++ if (unlikely(result < 0 && result != -ENOMEM)) { ++ tcp_tfw_handle_error(sk, result); ++ return; ++ } ++ } ++#endif + if (tcp_send_head(sk)) { + struct tcp_sock *tp = tcp_sk(sk); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (mss_now != 0) { ++ int nonagle = TCP_NAGLE_OFF | TCP_NAGLE_PUSH; ++ __tcp_push_pending_frames(sk, mss_now, nonagle); ++ } else ++#endif + __tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle); + } + } diff --git a/include/net/tls.h b/include/net/tls.h index 33f657d3c..1b5286933 100644 --- a/include/net/tls.h @@ -2201,7 +2254,7 @@ index 67d846622..c54232188 100644 skb_reset_network_header(skb2); skb2->transport_header = skb2->network_header + state->hlen; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 5887eac87..805d2561d 100644 +index 5887eac87..ae8be8385 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -459,7 +459,9 @@ void tcp_init_sock(struct sock *sk) @@ -2256,15 +2309,46 @@ index 5887eac87..805d2561d 100644 static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) -@@ -893,6 +906,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, +@@ -893,6 +906,38 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, } return NULL; } +EXPORT_SYMBOL(tcp_stream_alloc_skb); ++ ++struct sk_buff *tcp_stream_alloc_skb_size(struct sock *sk, int size, gfp_t gfp, ++ bool force_schedule) ++{ ++ struct sk_buff *skb; ++ ++ skb = alloc_skb_fclone(MAX_TCP_HEADER + size, gfp); ++ if (likely(skb)) { ++ bool mem_scheduled; ++ ++ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); ++ if (force_schedule) { ++ mem_scheduled = true; ++ sk_forced_mem_schedule(sk, skb->truesize); ++ } else { ++ mem_scheduled = sk_wmem_schedule(sk, skb->truesize); ++ } ++ if (likely(mem_scheduled)) { ++ skb_reserve(skb, MAX_TCP_HEADER); ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); ++ return skb; ++ } ++ __kfree_skb(skb); ++ } else { ++ sk->sk_prot->enter_memory_pressure(sk); ++ sk_stream_moderate_sndbuf(sk); ++ } ++ return NULL; ++} ++EXPORT_SYMBOL(tcp_stream_alloc_skb_size); static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) -@@ -927,6 +941,7 @@ int 
tcp_send_mss(struct sock *sk, int *size_goal, int flags) +@@ -927,6 +972,7 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags) return mss_now; } @@ -2272,7 +2356,7 @@ index 5887eac87..805d2561d 100644 /* In some cases, sendmsg() could have added an skb to the write queue, * but failed adding payload on it. We need to remove it to consume less -@@ -1513,6 +1528,7 @@ static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) +@@ -1513,6 +1559,7 @@ static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) } __kfree_skb(skb); } @@ -2280,7 +2364,7 @@ index 5887eac87..805d2561d 100644 struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { -@@ -2682,7 +2698,7 @@ static const unsigned char new_state[16] = { +@@ -2682,7 +2729,7 @@ static const unsigned char new_state[16] = { [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; @@ -2289,7 +2373,7 @@ index 5887eac87..805d2561d 100644 { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; -@@ -2691,6 +2707,7 @@ static int tcp_close_state(struct sock *sk) +@@ -2691,6 +2738,7 @@ static int tcp_close_state(struct sock *sk) return next & TCP_ACTION_FIN; } @@ -2297,7 +2381,7 @@ index 5887eac87..805d2561d 100644 /* * Shutdown the sending side of a connection. Much like close except -@@ -2726,6 +2743,7 @@ int tcp_orphan_count_sum(void) +@@ -2726,6 +2774,7 @@ int tcp_orphan_count_sum(void) return max(total, 0); } @@ -2305,7 +2389,7 @@ index 5887eac87..805d2561d 100644 static int tcp_orphan_cache; static struct timer_list tcp_orphan_timer; -@@ -2977,6 +2995,7 @@ void tcp_write_queue_purge(struct sock *sk) +@@ -2977,6 +3026,7 @@ void tcp_write_queue_purge(struct sock *sk) tcp_sk(sk)->packets_out = 0; inet_csk(sk)->icsk_backoff = 0; } @@ -2313,7 +2397,7 @@ index 5887eac87..805d2561d 100644 int tcp_disconnect(struct sock *sk, int flags) { -@@ -4507,10 +4526,15 @@ void tcp_done(struct sock *sk) +@@ -4507,10 +4557,15 @@ void tcp_done(struct sock *sk) WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); @@ -2433,7 +2517,7 @@ index 0ecc7311d..7fd712ed5 100644 return NULL; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index e3167ad96..dcc382ae6 100644 +index e3167ad96..7fcf8d4b2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -39,6 +39,9 @@ @@ -2446,7 +2530,17 @@ index e3167ad96..dcc382ae6 100644 #include #include -@@ -396,7 +399,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, +@@ -156,6 +159,9 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta) + tp->snd_cwnd_stamp = tcp_jiffies32; + tp->snd_cwnd_used = 0; + } ++#ifdef CONFIG_SECURITY_TEMPESTA ++EXPORT_SYMBOL(tcp_cwnd_restart); ++#endif + + /* Congestion state accounting after a packet has been sent. */ + static void tcp_event_data_sent(struct tcp_sock *tp, +@@ -396,7 +402,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. 
*/ @@ -2455,7 +2549,7 @@ index e3167ad96..dcc382ae6 100644 { skb->ip_summed = CHECKSUM_PARTIAL; -@@ -409,6 +412,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) +@@ -409,6 +415,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) seq++; TCP_SKB_CB(skb)->end_seq = seq; } @@ -2463,7 +2557,7 @@ index e3167ad96..dcc382ae6 100644 static inline bool tcp_urg_mode(const struct tcp_sock *tp) { -@@ -1486,7 +1490,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, +@@ -1486,7 +1493,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, * otherwise socket can stall. */ @@ -2472,7 +2566,7 @@ index e3167ad96..dcc382ae6 100644 { struct tcp_sock *tp = tcp_sk(sk); -@@ -1497,9 +1501,10 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) +@@ -1497,9 +1504,10 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); } @@ -2484,7 +2578,7 @@ index e3167ad96..dcc382ae6 100644 { if (skb->len <= mss_now) { /* Avoid the costly divide in the normal -@@ -1512,11 +1517,12 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) +@@ -1512,11 +1520,12 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) TCP_SKB_CB(skb)->tcp_gso_size = mss_now; } } @@ -2498,7 +2592,7 @@ index e3167ad96..dcc382ae6 100644 { struct tcp_sock *tp = tcp_sk(sk); -@@ -1540,6 +1546,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de +@@ -1540,6 +1549,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de tcp_verify_left_out(tp); } @@ -2506,7 +2600,7 @@ index e3167ad96..dcc382ae6 100644 static bool tcp_has_tx_tstamp(const struct sk_buff *skb) { -@@ -1547,7 +1554,7 @@ static bool tcp_has_tx_tstamp(const struct sk_buff *skb) +@@ -1547,7 +1557,7 @@ static bool tcp_has_tx_tstamp(const struct sk_buff *skb) (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP); } @@ -2515,7 +2609,7 @@ index e3167ad96..dcc382ae6 100644 { struct skb_shared_info *shinfo = skb_shinfo(skb); -@@ -1563,12 +1570,14 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) +@@ -1563,12 +1573,14 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) TCP_SKB_CB(skb)->txstamp_ack = 0; } } @@ -2531,7 +2625,7 @@ index e3167ad96..dcc382ae6 100644 /* Insert buff after skb on the write or rtx queue of sk. */ static void tcp_insert_write_queue_after(struct sk_buff *skb, -@@ -1576,12 +1585,39 @@ static void tcp_insert_write_queue_after(struct sk_buff *skb, +@@ -1576,12 +1588,39 @@ static void tcp_insert_write_queue_after(struct sk_buff *skb, struct sock *sk, enum tcp_queue tcp_queue) { @@ -2571,7 +2665,18 @@ index e3167ad96..dcc382ae6 100644 /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. 
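
The tcp_fragment() hunks just below switch the split-buffer allocation to the new tcp_stream_alloc_skb_size() helper added earlier in net/ipv4/tcp.c, so that an skb carrying linear data can donate its head part to the new buffer. A hypothetical caller, shown only to illustrate the helper's contract:

#include <net/tcp.h>

/*
 * Sketch under assumptions: reserve `need` bytes of linear room on top of
 * MAX_TCP_HEADER. The caller and its error handling are illustrative only.
 */
static struct sk_buff *my_alloc_linear_skb(struct sock *sk, int need)
{
	struct sk_buff *skb;

	skb = tcp_stream_alloc_skb_size(sk, need, GFP_ATOMIC, true);
	if (!skb)
		return NULL;	/* caller retries later, as tcp_fragment() does */
	/* skb->data already points past the reserved MAX_TCP_HEADER; the next
	 * `need` bytes of linear space are free for skb_put_data(). */
	return skb;
}
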
-@@ -1617,7 +1653,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, +@@ -1597,6 +1636,10 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + long limit; + int nlen; + u8 flags; ++ int nsize = skb_headlen(skb) - len; ++ ++ if (nsize < 0) ++ nsize = 0; + + if (WARN_ON(len > skb->len)) + return -EINVAL; +@@ -1617,11 +1660,11 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; } @@ -2580,7 +2685,12 @@ index e3167ad96..dcc382ae6 100644 return -ENOMEM; /* Get a new skb... force flag on. */ -@@ -1632,6 +1668,9 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, +- buff = tcp_stream_alloc_skb(sk, gfp, true); ++ buff = tcp_stream_alloc_skb_size(sk, nsize, gfp, true); + if (!buff) + return -ENOMEM; /* We'll just try again later. */ + skb_copy_decrypted(buff, skb); +@@ -1632,6 +1675,9 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, nlen = skb->len - len; buff->truesize += nlen; skb->truesize -= nlen; @@ -2590,7 +2700,22 @@ index e3167ad96..dcc382ae6 100644 /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; -@@ -1719,7 +1758,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) +@@ -1687,7 +1733,13 @@ static int __pskb_trim_head(struct sk_buff *skb, int len) + struct skb_shared_info *shinfo; + int i, k, eat; + +- DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb)); ++ eat = min_t(int, len, skb_headlen(skb)); ++ if (eat) { ++ __skb_pull(skb, eat); ++ len -= eat; ++ if (!len) ++ return 0; ++ } + eat = len; + k = 0; + shinfo = skb_shinfo(skb); +@@ -1719,7 +1771,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { u32 delta_truesize; @@ -2599,7 +2724,7 @@ index e3167ad96..dcc382ae6 100644 return -ENOMEM; delta_truesize = __pskb_trim_head(skb, len); -@@ -1879,6 +1918,7 @@ unsigned int tcp_current_mss(struct sock *sk) +@@ -1879,6 +1931,7 @@ unsigned int tcp_current_mss(struct sock *sk) return mss_now; } @@ -2607,7 +2732,7 @@ index e3167ad96..dcc382ae6 100644 /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. * As additional protections, we do not touch cwnd in retransmission phases, -@@ -2153,8 +2193,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, +@@ -2153,13 +2206,16 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ @@ -2618,7 +2743,15 @@ index e3167ad96..dcc382ae6 100644 { int nlen = skb->len - len; struct sk_buff *buff; -@@ -2173,6 +2213,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, + u8 flags; + ++ if (skb->len != skb->data_len) ++ return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, skb, len, mss_now, gfp); ++ + /* All of a TSO frame must be composed of paged data. */ + DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len); + +@@ -2173,6 +2229,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; @@ -2628,7 +2761,7 @@ index e3167ad96..dcc382ae6 100644 /* Correct the sequence numbers. 
*/ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; -@@ -2199,6 +2242,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, +@@ -2199,6 +2258,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, return 0; } @@ -2636,7 +2769,7 @@ index e3167ad96..dcc382ae6 100644 /* Try to defer sending, if possible, in order to minimize the amount * of TSO splitting we do. View it as a kind of TSO Nagle test. -@@ -2345,6 +2389,14 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) +@@ -2345,6 +2405,14 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) tcp_has_tx_tstamp(skb) || !skb_pure_zcopy_same(skb, next)) return false; @@ -2651,57 +2784,46 @@ index e3167ad96..dcc382ae6 100644 len -= skb->len; } -@@ -2683,6 +2735,78 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) +@@ -2683,6 +2751,66 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); } +#ifdef CONFIG_SECURITY_TEMPESTA + +/** -+ * The next two functions are called from places: from `tcp_write_xmit` ++ * The next funtion is called from places: from `tcp_write_xmit` + * (a usual case) and from `tcp_write_wakeup`. In other places where + * `tcp_transmit_skb` is called we deal with special TCP skbs or skbs + * not from tcp send queue. + */ +static int -+tcp_tfw_sk_prepare_xmit(struct sock *sk, struct sk_buff *skb, -+ unsigned int mss_now, unsigned int *limit, -+ unsigned int *nskbs) -+{ -+ if (!sk->sk_prepare_xmit || !skb_tfw_tls_type(skb)) -+ return 0; -+ -+ if (unlikely(*limit <= TLS_MAX_OVERHEAD)) { -+ net_warn_ratelimited("%s: too small MSS %u" -+ " for TLS\n", -+ __func__, mss_now); -+ return -ENOMEM; -+ } -+ -+ if (*limit > TLS_MAX_PAYLOAD_SIZE + TLS_MAX_OVERHEAD) -+ *limit = TLS_MAX_PAYLOAD_SIZE; -+ else -+ *limit -= TLS_MAX_OVERHEAD; -+ -+ if (unlikely(skb_tfw_flags(skb) & SS_F_HTTP2_FRAME_PREPARED)) { -+ *nskbs = 1; -+ return 0; -+ } -+ -+ return sk->sk_prepare_xmit(sk, skb, mss_now, limit, nskbs); -+} -+ -+static int +tcp_tfw_sk_write_xmit(struct sock *sk, struct sk_buff *skb, -+ unsigned int mss_now, unsigned int limit, -+ unsigned int nskbs) ++ unsigned int mss_now) +{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ unsigned int in_flight = tcp_packets_in_flight(tp); ++ unsigned int send_win, cong_win; ++ unsigned int limit; + int result; + + if (!sk->sk_write_xmit || !skb_tfw_tls_type(skb)) + return 0; + -+ result = sk->sk_write_xmit(sk, skb, mss_now, limit, nskbs); ++ /* Should be checked early. */ ++ BUG_ON(after(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))); ++ cong_win = (tp->snd_cwnd - in_flight) * mss_now; ++ send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; ++ /* ++ * A receive side doesn’t start to process a TLS recod until ++ * it’s fully read from a socket. Too small record size causes ++ * too much overhead. On the other side too large record size ++ * can lead to significant delays on receive side if current ++ * TCP congestion and/or the receiver’s advertised window are ++ * smaller than a TLS record size. ++ */ ++ limit = min3(cong_win, send_win, (unsigned int)TLS_MAX_PAYLOAD_SIZE); ++ ++ result = sk->sk_write_xmit(sk, skb, mss_now, limit); + if (unlikely(result)) + return result; + @@ -2710,27 +2832,26 @@ index e3167ad96..dcc382ae6 100644 + return 0; +} + -+/** -+ * This function is similar to `tcp_write_err` except that we send -+ * TCP RST to remote peer. 
We call this function when an error occurs -+ * while sending data from which we cannot recover, so we close the -+ * connection with TCP RST. ++/* ++ * We should recalculate max_size, and split skb according ++ * new limit, because we add extra TLS_MAX_OVERHEAD bytes ++ * during tls encription. If we don't adjust it, we push ++ * skb with incorrect length to network. + */ -+static void -+tcp_tfw_handle_error(struct sock *sk, int error) -+{ -+ tcp_send_active_reset(sk, GFP_ATOMIC); -+ sk->sk_err = error; -+ sk->sk_error_report(sk); -+ tcp_write_queue_purge(sk); -+ tcp_done(sk); -+} ++#define TFW_ADJUST_TLS_OVERHEAD(max_size) \ ++do { \ ++ if (max_size > TLS_MAX_PAYLOAD_SIZE + TLS_MAX_OVERHEAD) \ ++ max_size = TLS_MAX_PAYLOAD_SIZE; \ ++ else \ ++ max_size -= TLS_MAX_OVERHEAD; \ ++} while(0) ++ +#endif + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. -@@ -2707,6 +2831,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2707,6 +2835,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int result; bool is_cwnd_limited = false, is_rwnd_limited = false; u32 max_segs; @@ -2740,31 +2861,32 @@ index e3167ad96..dcc382ae6 100644 sent_pkts = 0; -@@ -2773,7 +2900,16 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2773,7 +2904,17 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota, max_segs), nonagle); - +#ifdef CONFIG_SECURITY_TEMPESTA -+ result = tcp_tfw_sk_prepare_xmit(sk, skb, mss_now, &limit, -+ &nskbs); -+ if (unlikely(result)) { -+ if (result == -ENOMEM) -+ break; /* try again next time */ -+ tcp_tfw_handle_error(sk, result); -+ return false; ++ if (sk->sk_write_xmit && skb_tfw_tls_type(skb)) { ++ if (unlikely(limit <= TLS_MAX_OVERHEAD)) { ++ net_warn_ratelimited("%s: too small MSS %u" ++ " for TLS\n", ++ __func__, mss_now); ++ break; ++ } ++ TFW_ADJUST_TLS_OVERHEAD(limit); + } +#endif if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; -@@ -2788,7 +2924,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2788,7 +2929,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, */ if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) break; - +#ifdef CONFIG_SECURITY_TEMPESTA -+ result = tcp_tfw_sk_write_xmit(sk, skb, mss_now, limit, nskbs); ++ result = tcp_tfw_sk_write_xmit(sk, skb, mss_now); + if (unlikely(result)) { + if (result == -ENOMEM) + break; /* try again next time */ @@ -2775,7 +2897,7 @@ index e3167ad96..dcc382ae6 100644 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; -@@ -2978,6 +3122,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, +@@ -2978,6 +3127,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } @@ -2783,7 +2905,7 @@ index e3167ad96..dcc382ae6 100644 /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. -@@ -3336,7 +3481,7 @@ start: +@@ -3336,7 +3486,7 @@ start: cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. 
*/ } else { @@ -2792,7 +2914,7 @@ index e3167ad96..dcc382ae6 100644 return -ENOMEM; diff = tcp_skb_pcount(skb); -@@ -3577,6 +3722,7 @@ void tcp_send_fin(struct sock *sk) +@@ -3577,6 +3727,7 @@ void tcp_send_fin(struct sock *sk) } __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); } @@ -2800,7 +2922,7 @@ index e3167ad96..dcc382ae6 100644 /* We get here when a process closes a file descriptor (either due to * an explicit close() or as a byproduct of exit()'ing) and there -@@ -3610,6 +3756,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) +@@ -3610,6 +3761,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) */ trace_tcp_send_reset(sk, NULL); } @@ -2808,7 +2930,7 @@ index e3167ad96..dcc382ae6 100644 /* Send a crossed SYN-ACK during socket establishment. * WARNING: This routine must only be called when we have already sent -@@ -4292,6 +4439,9 @@ int tcp_write_wakeup(struct sock *sk, int mib) +@@ -4292,6 +4444,9 @@ int tcp_write_wakeup(struct sock *sk, int mib) skb = tcp_send_head(sk); if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { @@ -2818,29 +2940,31 @@ index e3167ad96..dcc382ae6 100644 int err; unsigned int mss = tcp_current_mss(sk); unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; -@@ -4299,6 +4449,15 @@ int tcp_write_wakeup(struct sock *sk, int mib) - if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) - tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; - -+#ifdef CONFIG_SECURITY_TEMPESTA -+ err = tcp_tfw_sk_prepare_xmit(sk, skb, mss, &seg_size, &nskbs); -+ if (unlikely(err)) { -+ if (err != -ENOMEM) -+ tcp_tfw_handle_error(sk, err); -+ return err; -+ } +@@ -4306,6 +4461,17 @@ int tcp_write_wakeup(struct sock *sk, int mib) + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || + skb->len > mss) { + seg_size = min(seg_size, mss); ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (sk->sk_write_xmit && skb_tfw_tls_type(skb)) { ++ if (unlikely(seg_size <= TLS_MAX_OVERHEAD)) { ++ net_warn_ratelimited("%s: too small" ++ " MSS %u for TLS\n", ++ __func__, mss); ++ return -ENOMEM; ++ } ++ TFW_ADJUST_TLS_OVERHEAD(seg_size); ++ } +#endif -+ - /* We are probing the opening of a window - * but the window size is != 0 - * must have been a result SWS avoidance ( sender ) -@@ -4314,6 +4473,16 @@ int tcp_write_wakeup(struct sock *sk, int mib) + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; + if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, seg_size, mss, GFP_ATOMIC)) +@@ -4314,6 +4480,16 @@ int tcp_write_wakeup(struct sock *sk, int mib) tcp_set_skb_tso_segs(skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; + +#ifdef CONFIG_SECURITY_TEMPESTA -+ err = tcp_tfw_sk_write_xmit(sk, skb, mss, seg_size, nskbs); ++ err = tcp_tfw_sk_write_xmit(sk, skb, mss); + if (unlikely(err)) { + if (err != -ENOMEM) + tcp_tfw_handle_error(sk, err); From 094940ed95335c277fec3c13fc86792f1b2dce08 Mon Sep 17 00:00:00 2001 From: kingluo Date: Mon, 8 Jul 2024 14:47:37 +0800 Subject: [PATCH 20/25] Revert "enable fpu in the whole softirq ctx" This reverts commit 89d2f30405fe151df61b2cc4d3cc58d4fef56e25. 
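
With the softirq-wide FPU enabling reverted, any remaining code path that actually touches SIMD registers has to bracket just its own critical region. A hedged illustration; my_avx2_hash() is a made-up placeholder, not part of this series:

#include <linux/types.h>
#include <asm/fpu/api.h>

/* Hypothetical AVX2 routine; stands in for any SIMD user in the module. */
static u32 my_avx2_hash(const void *data, size_t len);

/*
 * Illustration only: with the blanket kernel_fpu_begin() calls removed
 * below, each FPU user saves and restores the FPU context around the
 * region that really needs it.
 */
static u32 my_checksum(const void *data, size_t len)
{
	u32 sum;

	kernel_fpu_begin();	/* saves task FPU state, disables preemption */
	sum = my_avx2_hash(data, len);
	kernel_fpu_end();	/* restores state, re-enables preemption */
	return sum;
}
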
--- fw/apm.c | 4 ---- fw/cache.c | 2 -- fw/http_sched_ratio.c | 4 ---- fw/sock.c | 6 ------ fw/sock_clnt.c | 2 -- fw/sock_srv.c | 4 ---- tls/tls_ticket.c | 4 +--- 7 files changed, 1 insertion(+), 25 deletions(-) diff --git a/fw/apm.c b/fw/apm.c index 9594572e8..d68c74c29 100644 --- a/fw/apm.c +++ b/fw/apm.c @@ -1020,8 +1020,6 @@ tfw_apm_prcntl_tmfn(struct timer_list *t) TfwApmRBuf *rbuf = &data->rbuf; TfwApmRBEnt *rbent = rbuf->rbent; - kernel_fpu_begin(); - /* * Increment the counter and make the updates use the other array * of the two that are available. In the meanwhile, use the array @@ -1061,8 +1059,6 @@ tfw_apm_hm_timer_cb(struct timer_list *t) TfwApmHM *hm = READ_ONCE(hmctl->hm); unsigned long now; - kernel_fpu_begin(); - BUG_ON(!hm); if (!atomic64_read(&hmctl->rcount)) tfw_http_hm_srv_send(srv, hm->req, hm->reqsz); diff --git a/fw/cache.c b/fw/cache.c index f855dd059..ab59ca589 100644 --- a/fw/cache.c +++ b/fw/cache.c @@ -3158,8 +3158,6 @@ tfw_wq_tasklet(unsigned long data) TfwRBQueue *wq = &ct->wq; TfwCWork cw; - kernel_fpu_begin(); - while (!tfw_wq_pop(wq, &cw)) tfw_cache_do_action(cw.msg, cw.action); diff --git a/fw/http_sched_ratio.c b/fw/http_sched_ratio.c index 06553dbc7..11495921b 100644 --- a/fw/http_sched_ratio.c +++ b/fw/http_sched_ratio.c @@ -680,8 +680,6 @@ tfw_sched_ratio_dynamic_tmfn(struct timer_list *t) { TfwRatio *r = from_timer(r, t, timer); - kernel_fpu_begin(); - tfw_sched_ratio_calc_tmfn(r, tfw_sched_ratio_calc_dynamic); } @@ -693,8 +691,6 @@ tfw_sched_ratio_predict_tmfn(struct timer_list *t) { TfwRatio *r = from_timer(r, t, timer); - kernel_fpu_begin(); - tfw_sched_ratio_calc_tmfn(r, tfw_sched_ratio_calc_predict); } diff --git a/fw/sock.c b/fw/sock.c index 61ffd344a..7920fbf25 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -190,7 +190,6 @@ ss_active_guard_exit(unsigned long val) static void ss_conn_drop_guard_exit(struct sock *sk) { - kernel_fpu_begin(); SS_CONN_TYPE(sk) &= ~Conn_Closing; SS_CALL(connection_drop, sk); if (sk->sk_security) @@ -993,8 +992,6 @@ ss_tcp_data_ready(struct sock *sk) int (*action)(struct sock *sk, int flags); bool was_stopped = (SS_CONN_TYPE(sk) & Conn_Stop); - kernel_fpu_begin(); - T_DBG3("[%d]: %s: sk=%p state=%s\n", smp_processor_id(), __func__, sk, ss_statename[sk->sk_state]); assert_spin_locked(&sk->sk_lock.slock); @@ -1081,7 +1078,6 @@ ss_tcp_data_ready(struct sock *sk) static void ss_tcp_state_change(struct sock *sk) { - kernel_fpu_begin(); T_DBG3("[%d]: %s: sk=%p state=%s\n", smp_processor_id(), __func__, sk, ss_statename[sk->sk_state]); ss_sk_incoming_cpu_update(sk); @@ -1529,8 +1525,6 @@ ss_tx_action(void) TfwRBQueue *wq = this_cpu_ptr(&si_wq); long ticket = 0; - kernel_fpu_begin(); - /* * @budget limits the loop to prevent live lock on constantly arriving * new items. We use some small integer as a lower bound to catch just diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index d13d520a7..40ff4c093 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -71,8 +71,6 @@ tfw_sock_cli_keepalive_timer_cb(struct timer_list *t) { TfwCliConn *cli_conn = from_timer(cli_conn, t, timer); - kernel_fpu_begin(); - T_DBG("Client timeout end\n"); /* diff --git a/fw/sock_srv.c b/fw/sock_srv.c index 85a44e8ac..f6c3a9df2 100644 --- a/fw/sock_srv.c +++ b/fw/sock_srv.c @@ -300,8 +300,6 @@ tfw_sock_srv_connect_retry_timer_cb(struct timer_list *t) { TfwSrvConn *srv_conn = from_timer(srv_conn, t, timer); - kernel_fpu_begin(); - /* A new socket is created for each connect attempt. 
*/ tfw_sock_srv_connect_try(srv_conn); } @@ -818,8 +816,6 @@ tfw_sock_srv_grace_shutdown_cb(struct timer_list *t) { TfwServer *srv = from_timer(srv, t, gs_timer); - kernel_fpu_begin(); - tfw_sock_srv_grace_stop(srv); } diff --git a/tls/tls_ticket.c b/tls/tls_ticket.c index 07c55c82b..b218e0d1e 100644 --- a/tls/tls_ticket.c +++ b/tls/tls_ticket.c @@ -6,7 +6,7 @@ * Based on mbed TLS, https://tls.mbed.org. * * Copyright (C) 2006-2015, ARM Limited, All Rights Reserved - * Copyright (C) 2015-2021 Tempesta Technologies, Inc. + * Copyright (C) 2015-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -179,8 +179,6 @@ ttls_ticket_rotate_keys(struct timer_list *t) TlsTicketPeerCfg *tcfg = from_timer(tcfg, t, timer); unsigned long secs; - kernel_fpu_begin(); - T_DBG("TLS: Rotate keys for ticket configuration [%pK]\n", tcfg); if (ttls_ticket_update_keys(tcfg)) T_ERR("TLS: Can't rotate keys for ticket configuration [%pK]\n", From eb1e9ec75b006dbb2eeef6c84e5ee6d66a17825f Mon Sep 17 00:00:00 2001 From: kingluo Date: Mon, 8 Jul 2024 15:41:02 +0800 Subject: [PATCH 21/25] try endbr64 on each switch label --- fw/str_avx2.S | 35 +++++++++++++++++++++++++++++++---- tls/bignum_x86-64.S | 10 +++++++--- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/fw/str_avx2.S b/fw/str_avx2.S index 02e3e0f2a..daf6629e9 100644 --- a/fw/str_avx2.S +++ b/fw/str_avx2.S @@ -5,7 +5,7 @@ * description and performance comparison with other implementations at * http://natsys-lab.blogspot.ru/2016/10/http-strings-processing-using-c-sse42.html * - * Copyright (C) 2016-2023 Tempesta Technologies, Inc. + * Copyright (C) 2016-2024 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -311,7 +311,7 @@ SYM_FUNC_START(__tfw_strtolower_avx2) * a constant and there is no speculation required for the attack. */ ANNOTATE_RETPOLINE_SAFE - notrack jmpq *%rax + jmpq *%rax .section .rodata .align 8 .str2low_switch: @@ -327,22 +327,31 @@ SYM_FUNC_START(__tfw_strtolower_avx2) .text .str2low_len8: + endbr64; __STRTOLOWER_SMALL_STR 7 .str2low_len7: + endbr64; __STRTOLOWER_SMALL_STR 6 .str2low_len6: + endbr64; __STRTOLOWER_SMALL_STR 5 .str2low_len5: + endbr64; __STRTOLOWER_SMALL_STR 4 .str2low_len4: + endbr64; __STRTOLOWER_SMALL_STR 3 .str2low_len3: + endbr64; __STRTOLOWER_SMALL_STR 2 .str2low_len2: + endbr64; __STRTOLOWER_SMALL_STR 1 .str2low_len1: + endbr64; __STRTOLOWER_SMALL_STR 0 .str2low_len0: + endbr64; addq $88, %rsp popq %rbx popq %r10 @@ -481,7 +490,7 @@ SYM_FUNC_START(__tfw_stricmp_avx2) /* Process short strings below 8 bytes in length. 
*/ movq .stricmp_switch(,%rdx,8), %rax ANNOTATE_RETPOLINE_SAFE /* constant bounds check */ - notrack jmpq *%rax + jmpq *%rax .section .rodata .align 8 .stricmp_switch: @@ -496,6 +505,7 @@ SYM_FUNC_START(__tfw_stricmp_avx2) .quad .stricmp_len8 .text .stricmp_len7: + endbr64; xorl %eax, %eax /* * The blocks at the below use complex mixture for the registers, @@ -552,27 +562,35 @@ SYM_FUNC_START(__tfw_stricmp_avx2) orl %edx, %eax RET .stricmp_len0: + endbr64; xorl %eax, %eax RET .stricmp_len1: + endbr64; xorl %edx, %edx jmp .stricmp_do_len1 .stricmp_len2: + endbr64; xorl %edx, %edx jmp .stricmp_do_len2 .stricmp_len3: + endbr64; xorl %eax, %eax jmp .stricmp_do_len3 .stricmp_len4: + endbr64; xorl %eax, %eax jmp .stricmp_do_len4 .stricmp_len5: + endbr64; xorl %edx, %edx jmp .stricmp_do_len5 .stricmp_len6: + endbr64; xorl %edx, %edx jmp .stricmp_do_len6 .stricmp_len8: + endbr64; movzbl 7(%rdi), %edx movzbl 7(%rsi), %eax movzbl __tfw_lct(%rdx), %edx @@ -838,7 +856,7 @@ SYM_FUNC_START(__tfw_stricmp_avx2_2lc) movq .sic2lc_switch(,%rdx,8), %rax ANNOTATE_RETPOLINE_SAFE /* constant bounds check */ - notrack jmpq *%rax + jmpq *%rax .section .rodata .align 8 .sic2lc_switch: @@ -853,6 +871,7 @@ SYM_FUNC_START(__tfw_stricmp_avx2_2lc) .quad .sic2lc_len8 .text .sic2lc_len7: + endbr64; xorl %eax, %eax .sic2lc_do_len7: movzbl 6(%rdi), %edx @@ -900,27 +919,35 @@ SYM_FUNC_START(__tfw_stricmp_avx2_2lc) orl %edx, %eax RET .sic2lc_len0: + endbr64; xorl %eax, %eax RET .sic2lc_len1: + endbr64; xorl %edx, %edx jmp .sic2lc_do_len1 .sic2lc_len2: + endbr64; xorl %edx, %edx jmp .sic2lc_do_len2 .sic2lc_len3: + endbr64; xorl %eax, %eax jmp .sic2lc_do_len3 .sic2lc_len4: + endbr64; xorl %eax, %eax jmp .sic2lc_do_len4 .sic2lc_len5: + endbr64; xorl %edx, %edx jmp .sic2lc_do_len5 .sic2lc_len6: + endbr64; xorl %edx, %edx jmp .sic2lc_do_len6 .sic2lc_len8: + endbr64; movzbl 7(%rdi), %eax movzbl __tfw_lct(%rax), %edx xorb 7(%rsi), %dl diff --git a/tls/bignum_x86-64.S b/tls/bignum_x86-64.S index 29b470647..f714b4eae 100644 --- a/tls/bignum_x86-64.S +++ b/tls/bignum_x86-64.S @@ -1,7 +1,7 @@ /** * Tempesta FW * - * Copyright (C) 2020-2021 Tempesta Technologies, Inc. + * Copyright (C) 2020-2024 Tempesta Technologies, Inc. 
* * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -192,28 +192,32 @@ SYM_FUNC_START(mpi_sub_x86_64) loop .sub_by_4 popq %r12 ANNOTATE_RETPOLINE_SAFE - notrack jmpq *%rbx + jmpq *%rbx .sub_small_b: clc ANNOTATE_RETPOLINE_SAFE - notrack jmpq *%rbx + jmpq *%rbx .sub_tail3: + endbr64; movq (%rdx, %rax, 8), %r9 sbbq (%rsi, %rax, 8), %r9 movq %r9, (%rdi, %rax, 8) incq %rax .sub_tail2: + endbr64; movq (%rdx, %rax, 8), %r10 sbbq (%rsi, %rax, 8), %r10 movq %r10, (%rdi, %rax, 8) incq %rax .sub_tail1: + endbr64; movq (%rdx, %rax, 8), %r11 sbbq (%rsi, %rax, 8), %r11 movq %r11, (%rdi, %rax, 8) incq %rax .sub_tail0: + endbr64; popq %rbx /* From 2406021a8b7856c11874ae58cb507b1253ab9b4a Mon Sep 17 00:00:00 2001 From: kingluo Date: Mon, 8 Jul 2024 19:05:43 +0800 Subject: [PATCH 22/25] remove notrack, prefix endbr64 at each jump table entry --- fw/str_avx2.S | 54 ++++++++++++++++++++++----------------------- tls/bignum_x86-64.S | 8 +++---- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/fw/str_avx2.S b/fw/str_avx2.S index daf6629e9..e7d5138c8 100644 --- a/fw/str_avx2.S +++ b/fw/str_avx2.S @@ -327,31 +327,31 @@ SYM_FUNC_START(__tfw_strtolower_avx2) .text .str2low_len8: - endbr64; + endbr64 __STRTOLOWER_SMALL_STR 7 .str2low_len7: - endbr64; + endbr64 __STRTOLOWER_SMALL_STR 6 .str2low_len6: - endbr64; + endbr64 __STRTOLOWER_SMALL_STR 5 .str2low_len5: - endbr64; + endbr64 __STRTOLOWER_SMALL_STR 4 .str2low_len4: - endbr64; + endbr64 __STRTOLOWER_SMALL_STR 3 .str2low_len3: - endbr64; + endbr64 __STRTOLOWER_SMALL_STR 2 .str2low_len2: - endbr64; + endbr64 __STRTOLOWER_SMALL_STR 1 .str2low_len1: - endbr64; + endbr64 __STRTOLOWER_SMALL_STR 0 .str2low_len0: - endbr64; + endbr64 addq $88, %rsp popq %rbx popq %r10 @@ -505,7 +505,7 @@ SYM_FUNC_START(__tfw_stricmp_avx2) .quad .stricmp_len8 .text .stricmp_len7: - endbr64; + endbr64 xorl %eax, %eax /* * The blocks at the below use complex mixture for the registers, @@ -562,35 +562,35 @@ SYM_FUNC_START(__tfw_stricmp_avx2) orl %edx, %eax RET .stricmp_len0: - endbr64; + endbr64 xorl %eax, %eax RET .stricmp_len1: - endbr64; + endbr64 xorl %edx, %edx jmp .stricmp_do_len1 .stricmp_len2: - endbr64; + endbr64 xorl %edx, %edx jmp .stricmp_do_len2 .stricmp_len3: - endbr64; + endbr64 xorl %eax, %eax jmp .stricmp_do_len3 .stricmp_len4: - endbr64; + endbr64 xorl %eax, %eax jmp .stricmp_do_len4 .stricmp_len5: - endbr64; + endbr64 xorl %edx, %edx jmp .stricmp_do_len5 .stricmp_len6: - endbr64; + endbr64 xorl %edx, %edx jmp .stricmp_do_len6 .stricmp_len8: - endbr64; + endbr64 movzbl 7(%rdi), %edx movzbl 7(%rsi), %eax movzbl __tfw_lct(%rdx), %edx @@ -871,7 +871,7 @@ SYM_FUNC_START(__tfw_stricmp_avx2_2lc) .quad .sic2lc_len8 .text .sic2lc_len7: - endbr64; + endbr64 xorl %eax, %eax .sic2lc_do_len7: movzbl 6(%rdi), %edx @@ -919,35 +919,35 @@ SYM_FUNC_START(__tfw_stricmp_avx2_2lc) orl %edx, %eax RET .sic2lc_len0: - endbr64; + endbr64 xorl %eax, %eax RET .sic2lc_len1: - endbr64; + endbr64 xorl %edx, %edx jmp .sic2lc_do_len1 .sic2lc_len2: - endbr64; + endbr64 xorl %edx, %edx jmp .sic2lc_do_len2 .sic2lc_len3: - endbr64; + endbr64 xorl %eax, %eax jmp .sic2lc_do_len3 .sic2lc_len4: - endbr64; + endbr64 xorl %eax, %eax jmp .sic2lc_do_len4 .sic2lc_len5: - endbr64; + endbr64 xorl %edx, %edx jmp .sic2lc_do_len5 .sic2lc_len6: - endbr64; + endbr64 xorl %edx, %edx jmp .sic2lc_do_len6 .sic2lc_len8: - endbr64; + endbr64 movzbl 7(%rdi), %eax movzbl __tfw_lct(%rax), %edx xorb 7(%rsi), %dl diff --git 
a/tls/bignum_x86-64.S b/tls/bignum_x86-64.S index f714b4eae..d0303e804 100644 --- a/tls/bignum_x86-64.S +++ b/tls/bignum_x86-64.S @@ -199,25 +199,25 @@ SYM_FUNC_START(mpi_sub_x86_64) jmpq *%rbx .sub_tail3: - endbr64; + endbr64 movq (%rdx, %rax, 8), %r9 sbbq (%rsi, %rax, 8), %r9 movq %r9, (%rdi, %rax, 8) incq %rax .sub_tail2: - endbr64; + endbr64 movq (%rdx, %rax, 8), %r10 sbbq (%rsi, %rax, 8), %r10 movq %r10, (%rdi, %rax, 8) incq %rax .sub_tail1: - endbr64; + endbr64 movq (%rdx, %rax, 8), %r11 sbbq (%rsi, %rax, 8), %r11 movq %r11, (%rdi, %rax, 8) incq %rax .sub_tail0: - endbr64; + endbr64 popq %rbx /* From b73a31af9ace00244d344741171c7d54d3014804 Mon Sep 17 00:00:00 2001 From: kingluo Date: Tue, 9 Jul 2024 17:01:45 +0800 Subject: [PATCH 23/25] use struct_group to avoid __write_overflow_field --- fw/apm.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fw/apm.c b/fw/apm.c index d68c74c29..fef1a2eba 100644 --- a/fw/apm.c +++ b/fw/apm.c @@ -100,13 +100,13 @@ typedef struct { typedef struct { TfwPcntCtl ctl[TFW_STATS_RANGES]; - char __reset_from[0]; - unsigned long tot_cnt; - unsigned long tot_val; - unsigned int min_val; - unsigned int max_val; - unsigned long cnt[TFW_STATS_RANGES][TFW_STATS_BCKTS]; - char __reset_till[0]; + struct_group(reset, + unsigned long tot_cnt; + unsigned long tot_val; + unsigned int min_val; + unsigned int max_val; + unsigned long cnt[TFW_STATS_RANGES][TFW_STATS_BCKTS]; + ); } TfwPcntRanges __attribute__((aligned(L1_CACHE_BYTES))); static inline unsigned long * @@ -840,9 +840,7 @@ tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats static inline void __tfw_apm_rbent_reset(TfwApmRBEnt *crbent, unsigned long jtmistamp) { - memset(crbent->pcntrng.__reset_from, 0, - offsetof(TfwPcntRanges, __reset_till) - - offsetof(TfwPcntRanges, __reset_from)); + memset(&crbent->pcntrng.reset, 0, sizeof(crbent->pcntrng.reset)); crbent->pcntrng.min_val = UINT_MAX; crbent->jtmistamp = jtmistamp; smp_mb__before_atomic(); @@ -1020,6 +1018,8 @@ tfw_apm_prcntl_tmfn(struct timer_list *t) TfwApmRBuf *rbuf = &data->rbuf; TfwApmRBEnt *rbent = rbuf->rbent; + kernel_fpu_begin(); + /* * Increment the counter and make the updates use the other array * of the two that are available. In the meanwhile, use the array @@ -1059,6 +1059,8 @@ tfw_apm_hm_timer_cb(struct timer_list *t) TfwApmHM *hm = READ_ONCE(hmctl->hm); unsigned long now; + kernel_fpu_begin(); + BUG_ON(!hm); if (!atomic64_read(&hmctl->rcount)) tfw_http_hm_srv_send(srv, hm->req, hm->reqsz); From 09bd329d76bf16b11ada8a033b585e28f04d0d3d Mon Sep 17 00:00:00 2001 From: kingluo Date: Sun, 14 Jul 2024 23:48:11 +0800 Subject: [PATCH 24/25] Remove `kernel_fpu_begin()` added by mistake --- fw/apm.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fw/apm.c b/fw/apm.c index fef1a2eba..d77f97c33 100644 --- a/fw/apm.c +++ b/fw/apm.c @@ -1018,8 +1018,6 @@ tfw_apm_prcntl_tmfn(struct timer_list *t) TfwApmRBuf *rbuf = &data->rbuf; TfwApmRBEnt *rbent = rbuf->rbent; - kernel_fpu_begin(); - /* * Increment the counter and make the updates use the other array * of the two that are available. 
In the meanwhile, use the array @@ -1059,8 +1057,6 @@ tfw_apm_hm_timer_cb(struct timer_list *t) TfwApmHM *hm = READ_ONCE(hmctl->hm); unsigned long now; - kernel_fpu_begin(); - BUG_ON(!hm); if (!atomic64_read(&hmctl->rcount)) tfw_http_hm_srv_send(srv, hm->req, hm->reqsz); From 09f70b3ca87a47b2481ef450fd6e4b6969bc71d5 Mon Sep 17 00:00:00 2001 From: supervisor Date: Fri, 5 Jul 2024 17:56:34 +0400 Subject: [PATCH 25/25] Implemented regex for locations and httptables Made regex configuration same way as in Nginx. --- Makefile | 2 +- fw/cfg.c | 97 +- fw/cfg.h | 10 + fw/http_match.c | 125 +- fw/http_match.h | 10 +- fw/http_tbl.c | 6 +- fw/str.h | 2 + fw/t/unit/test_http_match.c | 3 +- fw/vhost.c | 52 +- install.txt | 42 + regex/Makefile | 82 + regex/alloc.c | 135 + regex/allocator.h | 66 + regex/build.sh | 17 + regex/crc32.h | 58 + regex/database.c | 474 ++++ regex/database.h | 142 + regex/dkms.conf | 8 + regex/fdr/fdr.c | 881 ++++++ regex/fdr/fdr.h | 85 + regex/fdr/fdr_confirm.h | 94 + regex/fdr/fdr_confirm_runtime.h | 104 + regex/fdr/fdr_internal.h | 105 + regex/fdr/fdr_loadval.h | 71 + regex/fdr/flood_runtime.h | 337 +++ regex/fdr/teddy.c | 1114 ++++++++ regex/fdr/teddy.h | 110 + regex/fdr/teddy_avx2.c | 709 +++++ regex/fdr/teddy_internal.h | 66 + regex/fdr/teddy_runtime_common.h | 459 ++++ regex/hs.h | 51 + regex/hs_common.h | 600 ++++ regex/hs_compile.h | 1224 +++++++++ regex/hs_internal.h | 89 + regex/hs_runtime.h | 683 +++++ regex/hs_version.c | 36 + regex/hwlm/hwlm.c | 247 ++ regex/hwlm/hwlm.h | 145 + regex/hwlm/hwlm_internal.h | 62 + regex/hwlm/noodle_engine.c | 447 +++ regex/hwlm/noodle_engine.h | 60 + regex/hwlm/noodle_engine_avx2.c | 244 ++ regex/hwlm/noodle_engine_avx512.c | 191 ++ regex/hwlm/noodle_engine_sse.c | 203 ++ regex/hwlm/noodle_internal.h | 51 + regex/kmod/.clang-format | 683 +++++ regex/kmod/config.h | 109 + regex/kmod/hs_version.h | 39 + regex/kmod/rex.c | 649 +++++ regex/kmod/rex.h | 82 + regex/kmod/rex_trace.h | 37 + regex/kmod/ue2common_kern.h | 106 + regex/nfa/accel.c | 146 + regex/nfa/accel.h | 128 + regex/nfa/callback.h | 72 + regex/nfa/castle.c | 1149 ++++++++ regex/nfa/castle.h | 65 + regex/nfa/castle_internal.h | 143 + regex/nfa/gough.c | 1147 ++++++++ regex/nfa/gough.h | 82 + regex/nfa/gough_internal.h | 134 + regex/nfa/lbr.c | 531 ++++ regex/nfa/lbr.h | 150 + regex/nfa/lbr_common_impl.h | 462 ++++ regex/nfa/lbr_internal.h | 82 + regex/nfa/limex.h | 91 + regex/nfa/limex_64.c | 73 + regex/nfa/limex_accel.c | 170 ++ regex/nfa/limex_accel.h | 79 + regex/nfa/limex_common_impl.h | 431 +++ regex/nfa/limex_context.h | 91 + regex/nfa/limex_exceptional.h | 401 +++ regex/nfa/limex_internal.h | 203 ++ regex/nfa/limex_limits.h | 35 + regex/nfa/limex_native.c | 129 + regex/nfa/limex_ring.h | 106 + regex/nfa/limex_runtime.h | 201 ++ regex/nfa/limex_runtime_impl.h | 1079 ++++++++ regex/nfa/limex_shuffle.h | 78 + regex/nfa/limex_simd128.c | 63 + regex/nfa/limex_simd256.c | 60 + regex/nfa/limex_simd384.c | 60 + regex/nfa/limex_simd512.c | 60 + regex/nfa/limex_state_impl.h | 145 + regex/nfa/mcclellan.c | 1350 +++++++++ regex/nfa/mcclellan.h | 109 + regex/nfa/mcclellan_common_impl.h | 189 ++ regex/nfa/mcclellan_internal.h | 164 ++ regex/nfa/mcsheng.c | 2742 ++++++++++++++++++ regex/nfa/mcsheng.h | 157 ++ regex/nfa/mcsheng_data.c | 55 + regex/nfa/mcsheng_internal.h | 124 + regex/nfa/mpv.c | 1100 ++++++++ regex/nfa/mpv.h | 60 + regex/nfa/mpv_internal.h | 197 ++ regex/nfa/nfa_api.h | 280 ++ regex/nfa/nfa_api_dispatch.c | 368 +++ regex/nfa/nfa_api_queue.h | 289 ++ 
regex/nfa/nfa_api_util.h | 82 + regex/nfa/nfa_internal.h | 266 ++ regex/nfa/nfa_rev_api.h | 157 ++ regex/nfa/repeat.c | 1611 +++++++++++ regex/nfa/repeat.h | 370 +++ regex/nfa/repeat_internal.h | 218 ++ regex/nfa/sheng.c | 1877 +++++++++++++ regex/nfa/sheng.h | 143 + regex/nfa/sheng_defs.h | 754 +++++ regex/nfa/sheng_impl.h | 221 ++ regex/nfa/sheng_impl4.h | 711 +++++ regex/nfa/sheng_internal.h | 107 + regex/nfa/shufti.c | 1097 ++++++++ regex/nfa/shufti.h | 61 + regex/nfa/tamarama.c | 441 +++ regex/nfa/tamarama.h | 70 + regex/nfa/tamarama_internal.h | 105 + regex/nfa/truffle.c | 608 ++++ regex/nfa/truffle.h | 57 + regex/nfa/vermicelli.h | 518 ++++ regex/nfa/vermicelli_run.h | 90 + regex/nfa/vermicelli_sse.h | 889 ++++++ regex/report.h | 392 +++ regex/rose/block.c | 422 +++ regex/rose/catchup.c | 900 ++++++ regex/rose/catchup.h | 207 ++ regex/rose/counting_miracle.h | 263 ++ regex/rose/infix.h | 161 ++ regex/rose/init.c | 92 + regex/rose/init.h | 46 + regex/rose/match.c | 632 +++++ regex/rose/match.h | 383 +++ regex/rose/miracle.h | 138 + regex/rose/program_runtime.c | 3509 ++++++++++++++++++++++++ regex/rose/program_runtime.h | 61 + regex/rose/rose.h | 62 + regex/rose/rose_common.h | 56 + regex/rose/rose_internal.h | 659 +++++ regex/rose/rose_program.h | 724 +++++ regex/rose/rose_types.h | 71 + regex/rose/runtime.h | 160 ++ regex/rose/stream.c | 752 +++++ regex/rose/stream_long_lit.h | 372 +++ regex/rose/stream_long_lit_hash.h | 105 + regex/rose/validate_mask.h | 154 ++ regex/rose/validate_shufti.h | 372 +++ regex/runtime.c | 1356 +++++++++ regex/scratch.c | 466 ++++ regex/scratch.h | 276 ++ regex/smallwrite/smallwrite_internal.h | 53 + regex/som/som_operation.h | 84 + regex/som/som_runtime.c | 535 ++++ regex/som/som_runtime.h | 67 + regex/som/som_stream.c | 174 ++ regex/som/som_stream.h | 48 + regex/state.h | 69 + regex/stream_compress.c | 134 + regex/stream_compress.h | 55 + regex/stream_compress_impl.h | 193 ++ regex/ue2common.h | 247 ++ regex/util/arch.h | 92 + regex/util/bitutils.h | 492 ++++ regex/util/compare.h | 183 ++ regex/util/copybytes.h | 113 + regex/util/cpuid_flags.c | 176 ++ regex/util/cpuid_flags.h | 55 + regex/util/cpuid_inline.h | 260 ++ regex/util/exhaust.h | 41 + regex/util/fatbit.h | 93 + regex/util/intrinsics.h | 69 + regex/util/join.h | 40 + regex/util/logical.h | 77 + regex/util/masked_move.c | 91 + regex/util/masked_move.h | 82 + regex/util/multibit.c | 140 + regex/util/multibit.h | 1506 ++++++++++ regex/util/multibit_compress.h | 204 ++ regex/util/multibit_internal.h | 81 + regex/util/pack_bits.h | 227 ++ regex/util/partial_store.h | 163 ++ regex/util/popcount.h | 74 + regex/util/pqueue.h | 109 + regex/util/scatter.h | 55 + regex/util/scatter_runtime.h | 74 + regex/util/simd_types.h | 57 + regex/util/simd_utils.c | 62 + regex/util/simd_utils.h | 1424 ++++++++++ regex/util/state_compress.c | 617 +++++ regex/util/state_compress.h | 68 + regex/util/unaligned.h | 98 + regex/util/uniform_ops.h | 243 ++ scripts/install_regex.sh | 56 + scripts/regex_start.sh | 28 + scripts/regex_stop.sh | 8 + scripts/tempesta.sh | 11 + 193 files changed, 58582 insertions(+), 24 deletions(-) create mode 100644 install.txt create mode 100644 regex/Makefile create mode 100644 regex/alloc.c create mode 100644 regex/allocator.h create mode 100755 regex/build.sh create mode 100644 regex/crc32.h create mode 100644 regex/database.c create mode 100644 regex/database.h create mode 100644 regex/dkms.conf create mode 100644 regex/fdr/fdr.c create mode 100644 regex/fdr/fdr.h create mode 100644 
regex/fdr/fdr_confirm.h create mode 100644 regex/fdr/fdr_confirm_runtime.h create mode 100644 regex/fdr/fdr_internal.h create mode 100644 regex/fdr/fdr_loadval.h create mode 100644 regex/fdr/flood_runtime.h create mode 100644 regex/fdr/teddy.c create mode 100644 regex/fdr/teddy.h create mode 100644 regex/fdr/teddy_avx2.c create mode 100644 regex/fdr/teddy_internal.h create mode 100644 regex/fdr/teddy_runtime_common.h create mode 100644 regex/hs.h create mode 100644 regex/hs_common.h create mode 100644 regex/hs_compile.h create mode 100644 regex/hs_internal.h create mode 100644 regex/hs_runtime.h create mode 100644 regex/hs_version.c create mode 100644 regex/hwlm/hwlm.c create mode 100644 regex/hwlm/hwlm.h create mode 100644 regex/hwlm/hwlm_internal.h create mode 100644 regex/hwlm/noodle_engine.c create mode 100644 regex/hwlm/noodle_engine.h create mode 100644 regex/hwlm/noodle_engine_avx2.c create mode 100644 regex/hwlm/noodle_engine_avx512.c create mode 100644 regex/hwlm/noodle_engine_sse.c create mode 100644 regex/hwlm/noodle_internal.h create mode 100644 regex/kmod/.clang-format create mode 100644 regex/kmod/config.h create mode 100644 regex/kmod/hs_version.h create mode 100644 regex/kmod/rex.c create mode 100644 regex/kmod/rex.h create mode 100644 regex/kmod/rex_trace.h create mode 100644 regex/kmod/ue2common_kern.h create mode 100644 regex/nfa/accel.c create mode 100644 regex/nfa/accel.h create mode 100644 regex/nfa/callback.h create mode 100644 regex/nfa/castle.c create mode 100644 regex/nfa/castle.h create mode 100644 regex/nfa/castle_internal.h create mode 100644 regex/nfa/gough.c create mode 100644 regex/nfa/gough.h create mode 100644 regex/nfa/gough_internal.h create mode 100644 regex/nfa/lbr.c create mode 100644 regex/nfa/lbr.h create mode 100644 regex/nfa/lbr_common_impl.h create mode 100644 regex/nfa/lbr_internal.h create mode 100644 regex/nfa/limex.h create mode 100644 regex/nfa/limex_64.c create mode 100644 regex/nfa/limex_accel.c create mode 100644 regex/nfa/limex_accel.h create mode 100644 regex/nfa/limex_common_impl.h create mode 100644 regex/nfa/limex_context.h create mode 100644 regex/nfa/limex_exceptional.h create mode 100644 regex/nfa/limex_internal.h create mode 100644 regex/nfa/limex_limits.h create mode 100644 regex/nfa/limex_native.c create mode 100644 regex/nfa/limex_ring.h create mode 100644 regex/nfa/limex_runtime.h create mode 100644 regex/nfa/limex_runtime_impl.h create mode 100644 regex/nfa/limex_shuffle.h create mode 100644 regex/nfa/limex_simd128.c create mode 100644 regex/nfa/limex_simd256.c create mode 100644 regex/nfa/limex_simd384.c create mode 100644 regex/nfa/limex_simd512.c create mode 100644 regex/nfa/limex_state_impl.h create mode 100644 regex/nfa/mcclellan.c create mode 100644 regex/nfa/mcclellan.h create mode 100644 regex/nfa/mcclellan_common_impl.h create mode 100644 regex/nfa/mcclellan_internal.h create mode 100644 regex/nfa/mcsheng.c create mode 100644 regex/nfa/mcsheng.h create mode 100644 regex/nfa/mcsheng_data.c create mode 100644 regex/nfa/mcsheng_internal.h create mode 100644 regex/nfa/mpv.c create mode 100644 regex/nfa/mpv.h create mode 100644 regex/nfa/mpv_internal.h create mode 100644 regex/nfa/nfa_api.h create mode 100644 regex/nfa/nfa_api_dispatch.c create mode 100644 regex/nfa/nfa_api_queue.h create mode 100644 regex/nfa/nfa_api_util.h create mode 100644 regex/nfa/nfa_internal.h create mode 100644 regex/nfa/nfa_rev_api.h create mode 100644 regex/nfa/repeat.c create mode 100644 regex/nfa/repeat.h create mode 100644 
regex/nfa/repeat_internal.h create mode 100644 regex/nfa/sheng.c create mode 100644 regex/nfa/sheng.h create mode 100644 regex/nfa/sheng_defs.h create mode 100644 regex/nfa/sheng_impl.h create mode 100644 regex/nfa/sheng_impl4.h create mode 100644 regex/nfa/sheng_internal.h create mode 100644 regex/nfa/shufti.c create mode 100644 regex/nfa/shufti.h create mode 100644 regex/nfa/tamarama.c create mode 100644 regex/nfa/tamarama.h create mode 100644 regex/nfa/tamarama_internal.h create mode 100644 regex/nfa/truffle.c create mode 100644 regex/nfa/truffle.h create mode 100644 regex/nfa/vermicelli.h create mode 100644 regex/nfa/vermicelli_run.h create mode 100644 regex/nfa/vermicelli_sse.h create mode 100644 regex/report.h create mode 100644 regex/rose/block.c create mode 100644 regex/rose/catchup.c create mode 100644 regex/rose/catchup.h create mode 100644 regex/rose/counting_miracle.h create mode 100644 regex/rose/infix.h create mode 100644 regex/rose/init.c create mode 100644 regex/rose/init.h create mode 100644 regex/rose/match.c create mode 100644 regex/rose/match.h create mode 100644 regex/rose/miracle.h create mode 100644 regex/rose/program_runtime.c create mode 100644 regex/rose/program_runtime.h create mode 100644 regex/rose/rose.h create mode 100644 regex/rose/rose_common.h create mode 100644 regex/rose/rose_internal.h create mode 100644 regex/rose/rose_program.h create mode 100644 regex/rose/rose_types.h create mode 100644 regex/rose/runtime.h create mode 100644 regex/rose/stream.c create mode 100644 regex/rose/stream_long_lit.h create mode 100644 regex/rose/stream_long_lit_hash.h create mode 100644 regex/rose/validate_mask.h create mode 100644 regex/rose/validate_shufti.h create mode 100644 regex/runtime.c create mode 100644 regex/scratch.c create mode 100644 regex/scratch.h create mode 100644 regex/smallwrite/smallwrite_internal.h create mode 100644 regex/som/som_operation.h create mode 100644 regex/som/som_runtime.c create mode 100644 regex/som/som_runtime.h create mode 100644 regex/som/som_stream.c create mode 100644 regex/som/som_stream.h create mode 100644 regex/state.h create mode 100644 regex/stream_compress.c create mode 100644 regex/stream_compress.h create mode 100644 regex/stream_compress_impl.h create mode 100644 regex/ue2common.h create mode 100644 regex/util/arch.h create mode 100644 regex/util/bitutils.h create mode 100644 regex/util/compare.h create mode 100644 regex/util/copybytes.h create mode 100644 regex/util/cpuid_flags.c create mode 100644 regex/util/cpuid_flags.h create mode 100644 regex/util/cpuid_inline.h create mode 100644 regex/util/exhaust.h create mode 100644 regex/util/fatbit.h create mode 100644 regex/util/intrinsics.h create mode 100644 regex/util/join.h create mode 100644 regex/util/logical.h create mode 100644 regex/util/masked_move.c create mode 100644 regex/util/masked_move.h create mode 100644 regex/util/multibit.c create mode 100644 regex/util/multibit.h create mode 100644 regex/util/multibit_compress.h create mode 100644 regex/util/multibit_internal.h create mode 100644 regex/util/pack_bits.h create mode 100644 regex/util/partial_store.h create mode 100644 regex/util/popcount.h create mode 100644 regex/util/pqueue.h create mode 100644 regex/util/scatter.h create mode 100644 regex/util/scatter_runtime.h create mode 100644 regex/util/simd_types.h create mode 100644 regex/util/simd_utils.c create mode 100644 regex/util/simd_utils.h create mode 100644 regex/util/state_compress.c create mode 100644 regex/util/state_compress.h create mode 100644 
regex/util/unaligned.h create mode 100644 regex/util/uniform_ops.h create mode 100755 scripts/install_regex.sh create mode 100755 scripts/regex_start.sh create mode 100755 scripts/regex_stop.sh diff --git a/Makefile b/Makefile index c23a2aae4..da11e2e49 100644 --- a/Makefile +++ b/Makefile @@ -146,7 +146,7 @@ KERNEL = /lib/modules/$(shell uname -r)/build export KERNEL TFW_CFLAGS AVX2 BMI2 ADX TFW_GCOV -obj-m += lib/ db/core/ fw/ tls/ +obj-m += lib/ db/core/ regex/ fw/ tls/ all: build diff --git a/fw/cfg.c b/fw/cfg.c index 44e47a3e6..6ec5c3e3a 100644 --- a/fw/cfg.c +++ b/fw/cfg.c @@ -109,6 +109,9 @@ * them. Helpers below facilitate that. */ +unsigned short number_of_regex = 0; +unsigned short number_of_db_regex = 0; + static const char * __alloc_and_copy_literal(const char *src, size_t len, bool keep_bs) { @@ -397,6 +400,9 @@ typedef enum { TOKEN_SEMICOLON, TOKEN_LITERAL, TOKEN_ARROW, + TOKEN_TILDA, + TOKEN_REGEX, + TOKEN_REGEX_CI, _TOKEN_COUNT, } token_t; @@ -588,9 +594,12 @@ read_next_token(TfwCfgParserState *ps) TOKEN_NEQSIGN); TFSM_COND_MOVE_EXIT(ps->c == '>' && ps->prev_c == '-', TOKEN_ARROW); + TFSM_COND_MOVE_EXIT(ps->c == '*' && ps->prev_c == '~', + TOKEN_REGEX_CI); /* Special case to differ single equal sign from double one. */ TFSM_COND_MOVE(ps->c == '=', TS_EQSIGN); + TFSM_COND_MOVE(ps->c == '~', TS_TILDA); /* Everything else is not a special character and therefore * it starts a literal. */ @@ -619,6 +628,14 @@ read_next_token(TfwCfgParserState *ps) TFSM_JMP_EXIT(TOKEN_EQSIGN); } + FSM_STATE(TS_TILDA) { + TFSM_COND_JMP_EXIT(!ps->c, TOKEN_REGEX); + + /* If this is double equal sign, eat second sign and exit. */ + TFSM_COND_MOVE_EXIT(ps->c == '*', TOKEN_REGEX_CI); + TFSM_JMP_EXIT(TOKEN_REGEX); + } + FSM_STATE(TS_COMMENT) { TFSM_COND_JMP_EXIT(!ps->c, TOKEN_NA); @@ -732,7 +749,21 @@ entry_set_cond(TfwCfgEntry *e, token_t cond_type, const char *src, int len) if (!(e->name = alloc_and_copy_literal(name, name_len))) return -ENOMEM; - rule->inv = cond_type == TOKEN_DEQSIGN ? false : true; + switch (cond_type) { + case TOKEN_REGEX: + rule->regex = TFW_REGEX_REGULAR; + rule->inv = false; + break; + case TOKEN_REGEX_CI: + rule->regex = TFW_REGEX_CI; + rule->inv = false; + break; + default: + rule->regex = TFW_REGEX_NO; + rule->inv = cond_type == TOKEN_DEQSIGN ? false : true; + break; + } + return 0; } @@ -806,8 +837,10 @@ parse_cfg_entry(TfwCfgParserState *ps) FSM_STATE(PS_PLAIN_OR_RULE) { PFSM_COND_MOVE(ps->t == TOKEN_DEQSIGN || - ps->t == TOKEN_NEQSIGN, - PS_RULE_COND); + ps->t == TOKEN_NEQSIGN || + ps->t == TOKEN_REGEX || + ps->t == TOKEN_REGEX_CI, + PS_RULE_COND); PFSM_COND_MOVE(ps->t == TOKEN_LITERAL, PS_PLAIN_OR_LONG_RULE); /* Jump to plain val/attr scheme to make remained checks @@ -819,8 +852,10 @@ parse_cfg_entry(TfwCfgParserState *ps) FSM_STATE(PS_PLAIN_OR_LONG_RULE) { FSM_COND_JMP(ps->t == TOKEN_DEQSIGN || - ps->t == TOKEN_NEQSIGN, - PS_LONG_RULE_COND); + ps->t == TOKEN_NEQSIGN || + ps->t == TOKEN_REGEX || + ps->t == TOKEN_REGEX_CI, + PS_LONG_RULE_COND); /* This is not rule (simple or extended), so jump to * plain val/attr scheme. 
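Note on the tokenizer and parser changes above: they add ~ and ~* as rule-condition tokens (TOKEN_REGEX / TOKEN_REGEX_CI). A minimal sketch of the configuration syntax this is meant to enable; the particular fields, chain names and actions below are illustrative assumptions, not taken from this patch:

    http_chain {
        uri ~ "^/api/v[0-9]+/" -> app_chain;
        hdr "User-Agent" ~* "curl|wget" -> block;
    }

For the "location" directive the same tokens are rewritten into the literal values "regex" / "regex_ci" (see PS_STORE_VAL_PREV_REGEX further down), so a rule like location ~* "\.php$" { ... } is handled as if it were written location regex_ci "\.php$" { ... }.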
*/ @@ -828,9 +863,9 @@ parse_cfg_entry(TfwCfgParserState *ps) FSM_COND_JMP(ps->err, PS_EXIT); FSM_COND_JMP(ps->t == TOKEN_EQSIGN, PS_STORE_ATTR_PREV); FSM_COND_JMP(ps->t == TOKEN_LITERAL || - ps->t == TOKEN_SEMICOLON || - ps->t == TOKEN_LBRACE, - PS_STORE_VAL_PREV); + ps->t == TOKEN_SEMICOLON || + ps->t == TOKEN_LBRACE, + PS_STORE_VAL_PREV); ps->err = -EINVAL; FSM_JMP(PS_EXIT); @@ -838,16 +873,20 @@ parse_cfg_entry(TfwCfgParserState *ps) FSM_STATE(PS_LONG_RULE_COND) { ps->err = entry_add_rule_param(&ps->e.rule.fst_ext, - ps->prev_lit, - ps->prev_lit_len); + ps->prev_lit, + ps->prev_lit_len); FSM_COND_JMP(ps->err, PS_EXIT); PFSM_MOVE(PS_RULE_COND); } FSM_STATE(PS_RULE_COND) { + FSM_COND_JMP(ps->prev_t == TOKEN_REGEX || + ps->prev_t == TOKEN_REGEX_CI, + PS_STORE_VAL_PREV_REGEX); + PFSM_COND_JMP_EXIT_ERROR(ps->t != TOKEN_LITERAL); ps->err = entry_set_cond(&ps->e, ps->prev_t, ps->lit, - ps->lit_len); + ps->lit_len); FSM_COND_JMP(ps->err, PS_EXIT); PFSM_MOVE(PS_RULE_COND_END); } @@ -866,7 +905,7 @@ parse_cfg_entry(TfwCfgParserState *ps) FSM_STATE(PS_RULE_ACTION) { PFSM_COND_JMP_EXIT_ERROR(ps->t != TOKEN_LITERAL); ps->err = entry_add_rule_param(&ps->e.rule.act, ps->lit, - ps->lit_len); + ps->lit_len); FSM_COND_JMP(ps->err, PS_EXIT); PFSM_MOVE(PS_RULE_ACTION_VAL); } @@ -878,7 +917,7 @@ parse_cfg_entry(TfwCfgParserState *ps) PFSM_COND_JMP_EXIT_ERROR(ps->t != TOKEN_LITERAL); ps->err = entry_add_rule_param(&ps->e.rule.val, ps->lit, - ps->lit_len); + ps->lit_len); FSM_COND_JMP(ps->err, PS_EXIT); read_next_token(ps); @@ -914,6 +953,38 @@ parse_cfg_entry(TfwCfgParserState *ps) FSM_JMP(PS_STORE_VAL_PREV); } + FSM_STATE(PS_STORE_VAL_PREV_REGEX) { + /* name val1 val2; + * ^ + * We are here (but still need to store val1) + * and name or condition. + */ + T_DBG3("add value: %.*s\n", ps->prev_lit_len, ps->prev_lit); + + if (ps->e.ftoken && !strcmp(ps->e.ftoken, "location")) { + ps->err = entry_set_name(&ps->e); + + if (!ps->err) { + if (ps->prev_t == TOKEN_REGEX) + ps->err = entry_add_val(&ps->e, "regex", + sizeof("regex")); + if (ps->prev_t == TOKEN_REGEX_CI) + ps->err = entry_add_val(&ps->e, + "regex_ci", + sizeof("regex_ci")); + } + FSM_COND_JMP(ps->err, PS_EXIT); + FSM_JMP(PS_VAL_OR_ATTR); + } + + /*If it is not location*/ + ps->err = entry_set_cond(&ps->e, ps->prev_t, + ps->lit, ps->lit_len); + FSM_COND_JMP(ps->err, PS_EXIT); + PFSM_MOVE(PS_RULE_COND_END); + + } + FSM_STATE(PS_STORE_VAL_PREV) { /* name val1 val2; * ^ diff --git a/fw/cfg.h b/fw/cfg.h index b8f672d16..2978f7c12 100644 --- a/fw/cfg.h +++ b/fw/cfg.h @@ -149,6 +149,7 @@ typedef struct { const char *act; const char *val; bool inv; + int regex; } TfwCfgRule; typedef struct { @@ -336,6 +337,9 @@ struct TfwCfgSpec { void (*cleanup)(TfwCfgSpec *self); }; +extern unsigned short number_of_regex; +extern unsigned short number_of_db_regex; + /** * Walks over a NULL-terminated array of TfwCfgSpec structures. */ @@ -412,6 +416,12 @@ enum { TFW_CFG_B_KEEP, /* Keep an entry */ }; +enum { + TFW_REGEX_NO = 0, + TFW_REGEX_REGULAR, + TFW_REGEX_CI, +}; + #define TFW_CFG_F_ADD (1 << TFW_CFG_B_ADD) #define TFW_CFG_F_DEL (1 << TFW_CFG_B_DEL) #define TFW_CFG_F_MOD (1 << TFW_CFG_B_MOD) diff --git a/fw/http_match.c b/fw/http_match.c index 67ae151c6..9b13d91b0 100644 --- a/fw/http_match.c +++ b/fw/http_match.c @@ -71,6 +71,7 @@ #include "http_match.h" #include "http_msg.h" #include "cfg.h" +#include "regex/kmod/rex.h" /** * Map an operator to that flags passed to tfw_str_eq_*() functions. 
@@ -83,11 +84,34 @@ map_op_to_str_eq_flags(tfw_http_match_op_t op) [TFW_HTTP_MATCH_O_EQ] = TFW_STR_EQ_DEFAULT, [TFW_HTTP_MATCH_O_PREFIX] = TFW_STR_EQ_PREFIX, [TFW_HTTP_MATCH_O_SUFFIX] = TFW_STR_EQ_DEFAULT, + [TFW_HTTP_MATCH_O_REGEX] = TFW_STR_EQ_REGEX, + [TFW_HTTP_MATCH_O_REGEX_CI] = TFW_STR_EQ_REGEX_CASEI, }; BUG_ON(flags_tbl[op] < 0); return flags_tbl[op]; } +//extern int bpf_scan_bytes(const void *, __u32, struct rex_scan_attr *); + +extern int bpf_scan_tfwstr(const TfwStr *str, struct rex_scan_attr *attr); + +bool +tfw_match_regex(tfw_match_t op, const char *cstr, size_t len, const TfwStr *arg) +{ + bool result; + int r; + + struct rex_scan_attr attr = {}; + memcpy(&attr.database_id, cstr, sizeof(unsigned short)); + + if (!arg->len) + return false; + + r = bpf_scan_tfwstr(arg, &attr); + result = (!r && attr.nr_events && attr.last_event.expression); + return result; +} + static bool tfw_rule_str_match(const TfwStr *str, const char *cstr, int cstr_len, tfw_str_eq_flags_t flags, @@ -97,6 +121,9 @@ tfw_rule_str_match(const TfwStr *str, const char *cstr, return tfw_str_eq_cstr_off(str, str->len - cstr_len, cstr, cstr_len, flags); + if (op == TFW_HTTP_MATCH_O_REGEX) + return tfw_match_regex(op, cstr, cstr_len, str); + return tfw_str_eq_cstr(str, cstr, cstr_len, flags); } @@ -706,10 +733,93 @@ tfw_http_escape_pre_post(char *out , const char *str) return len; } +/* + * Here we create a text file for every regex expression; the file is + * then read and compiled by hscollider into a temporary DB, which is + * finally loaded into the regex module DB. + * Everything after the file creation is done by the + * scripts/regex_start.sh script. + * + * Since one DB may potentially contain several expressions, two + * counters are kept: + * number_of_db_regex - number of the database which will be used to + * look up the expression; + * number_of_regex - number of the expression, to know which exact + * expression was matched (parsing for it is not implemented yet). + * + * After this function, number_of_db_regex is written to the start of arg, + * so the regex string must be longer than two bytes. + * + * The directory /tmp/tempesta is created by the + * tempesta.sh script.
+ */ +int +write_regex(const char *arg, int regex) +{ + struct file *fl; + loff_t off = 0; + int r; + char file_name[25]; + char reg_number[6]; + int len = strlen(arg); + int len1; + + if (len < sizeof(unsigned short)) { + T_ERR_NL("String of regex too short\n"); + return -EINVAL; + } + + ++number_of_db_regex; + sprintf(file_name, "/tmp/tempesta/%u.txt", number_of_db_regex); + + fl = filp_open(file_name, O_CREAT | O_WRONLY, 0600); + if (IS_ERR(fl)) { + T_ERR_NL("Cannot create regex file %s\n", + file_name); + return -EINVAL; + } + BUG_ON(!fl || !fl->f_path.dentry); + + if (!fl->f_op->fallocate) { + T_ERR_NL("File requires filesystem with fallocate support\n"); + filp_close(fl, NULL); + return -EINVAL; + } + + ++number_of_regex; + sprintf(reg_number, "%i:", number_of_regex); + len1 = strlen(reg_number); + r = kernel_write(fl, (void *)reg_number, len1, &off); + if (r != len1) + goto err; + + r = kernel_write(fl, (void *)arg, len, &off); + if (r != len) + goto err; + + if (regex == TFW_REGEX_CI) { + r = kernel_write(fl, "i", 1, &off); + if (r != 1) + goto err; + } + + r = kernel_write(fl, "\n", 1, &off); + if (r != 1) + goto err; + + filp_close(fl, NULL); + return 0; +err: + T_ERR_NL("Cannot write regex\n"); + filp_close(fl, NULL); + return r; +} + const char * tfw_http_arg_adjust(const char *arg, tfw_http_match_fld_t field, - const char *raw_hdr_name, size_t *size_out, - tfw_http_match_arg_t *type_out, + const char *raw_hdr_name, int regex, + size_t *size_out, + tfw_http_match_arg_t *type_out, tfw_http_match_op_t *op_out) { char *arg_out, *pos; @@ -751,6 +861,11 @@ tfw_http_arg_adjust(const char *arg, tfw_http_match_fld_t field, if (wc_arg || (len > 1 && arg[len - 1] == '*' && arg[len - 2] != '\\')) *op_out = TFW_HTTP_MATCH_O_PREFIX; + if (!wc_arg && regex) { + *op_out = TFW_HTTP_MATCH_O_REGEX; + write_regex(arg, regex); + } + /* * For argument started with wildcard, the suffix matching * pattern should be applied. 
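For reference, write_regex() above emits one expression file per database under /tmp/tempesta/. Assuming (illustratively) that this is the third regex rule in the configuration and its pattern is "^/foo/.*" with case-insensitive matching, the call creates /tmp/tempesta/3.txt containing:

    3:^/foo/.*i

that is, <number_of_regex>:<pattern>, a trailing "i" only for case-insensitive expressions, and a final newline. scripts/regex_start.sh is then expected to feed these files to hscollider and load the compiled databases into the regex module.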
@@ -779,6 +894,12 @@ tfw_http_arg_adjust(const char *arg, tfw_http_match_fld_t field, len = tfw_http_escape_pre_post(pos, arg); *size_out += full_name_len + len + 1; + /* + * Save number_of_db_regex to use it in tfw_match_regex + */ + if (*op_out == TFW_HTTP_MATCH_O_REGEX) + memcpy(arg_out, &number_of_db_regex, sizeof(number_of_db_regex)); + return arg_out; } diff --git a/fw/http_match.h b/fw/http_match.h index 3da5b3df0..53c5d27f5 100644 --- a/fw/http_match.h +++ b/fw/http_match.h @@ -52,6 +52,8 @@ typedef enum { TFW_HTTP_MATCH_O_EQ, TFW_HTTP_MATCH_O_PREFIX, TFW_HTTP_MATCH_O_SUFFIX, + TFW_HTTP_MATCH_O_REGEX, + TFW_HTTP_MATCH_O_REGEX_CI,/*case insensitive*/ _TFW_HTTP_MATCH_O_COUNT } tfw_http_match_op_t; @@ -156,7 +158,8 @@ TfwHttpMatchRule *tfw_http_rule_new(TfwHttpChain *chain, int tfw_http_rule_arg_init(TfwHttpMatchRule *rule, const char *arg, size_t arg_len); const char *tfw_http_arg_adjust(const char *arg, tfw_http_match_fld_t field, - const char *raw_hdr_name, size_t *size_out, + const char *raw_hdr_name, int regex, + size_t *size_out, tfw_http_match_arg_t *type_out, tfw_http_match_op_t *op_out); const char *tfw_http_val_adjust(const char *val, tfw_http_match_fld_t field, @@ -170,6 +173,11 @@ int tfw_http_search_cookie(const char *cstr, unsigned long clen, TfwStr **pos, TfwStr *end, TfwStr *val, tfw_http_match_op_t op, bool is_resp_hdr); +int write_regex(const char *arg, int regex); + +bool tfw_match_regex(tfw_match_t op, const char *cstr, size_t len, + const TfwStr *arg); + #define tfw_http_chain_rules_for_each(chain, func) \ ({ \ int r = 0; \ diff --git a/fw/http_tbl.c b/fw/http_tbl.c index cf9a9ef8b..5abf30798 100644 --- a/fw/http_tbl.c +++ b/fw/http_tbl.c @@ -382,7 +382,7 @@ tfw_cfgop_http_rule(TfwCfgSpec *cs, TfwCfgEntry *e) const char *in_field, *in_field_val, *action, *action_val, *in_arg, *arg = NULL, *val = NULL; unsigned int invert, hid = TFW_HTTP_HDR_RAW, - act_val_parsed, val_len; + act_val_parsed, val_len, regex; tfw_http_match_op_t op = TFW_HTTP_MATCH_O_WILDCARD, op_val = TFW_HTTP_MATCH_O_WILDCARD; tfw_http_match_fld_t field = TFW_HTTP_MATCH_F_WILDCARD; @@ -398,6 +398,7 @@ tfw_cfgop_http_rule(TfwCfgSpec *cs, TfwCfgEntry *e) TFW_CFG_CHECK_NO_ATTRS(cs, e); invert = cfg_rule->inv; + regex = cfg_rule->regex; in_field = cfg_rule->fst; in_field_val = cfg_rule->fst_ext; in_arg = cfg_rule->snd; @@ -432,7 +433,8 @@ tfw_cfgop_http_rule(TfwCfgSpec *cs, TfwCfgEntry *e) } arg = tfw_http_arg_adjust(in_arg, field, in_field_val, - &arg_size, &type, &op); + cfg_rule->regex, &arg_size, + &type, &op); if (IS_ERR(arg)) return PTR_ERR(arg); } diff --git a/fw/str.h b/fw/str.h index 136860fd2..27fda9d64 100644 --- a/fw/str.h +++ b/fw/str.h @@ -437,6 +437,8 @@ typedef enum { TFW_STR_EQ_PREFIX = 0x1, TFW_STR_EQ_CASEI = 0x2, TFW_STR_EQ_PREFIX_CASEI = (TFW_STR_EQ_PREFIX | TFW_STR_EQ_CASEI), + TFW_STR_EQ_REGEX = 0x4, + TFW_STR_EQ_REGEX_CASEI = (TFW_STR_EQ_REGEX | TFW_STR_EQ_CASEI), } tfw_str_eq_flags_t; int tfw_strcpy(TfwStr *dst, const TfwStr *src); diff --git a/fw/t/unit/test_http_match.c b/fw/t/unit/test_http_match.c index ec8cb88c0..9da170ad2 100644 --- a/fw/t/unit/test_http_match.c +++ b/fw/t/unit/test_http_match.c @@ -139,7 +139,8 @@ test_chain_add_rule_str(int test_id, tfw_http_match_fld_t field, tfw_http_verify_hdr_field(field, &in_val, &hid); } val = tfw_http_val_adjust(in_val, field, &val_len, &val_type, &op_val); - arg = tfw_http_arg_adjust(in_arg, field, in_val, &arg_size, &type, &op); + arg = tfw_http_arg_adjust(in_arg, field, in_val, 0, + &arg_size, &type, &op); EXPECT_NOT_NULL(arg); 
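The two bytes stored here are what tfw_match_regex() reads back at request time. A self-contained sketch of that handshake (plain userspace C, names borrowed from the patch; the pattern and buffer sizes are illustrative):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char arg_out[16] = "..^/foo/.*";         /* adjusted rule argument */
            unsigned short number_of_db_regex = 3;   /* set at config time */
            unsigned short database_id;              /* recovered at match time */

            /* tfw_http_arg_adjust()/tfw_location_init(): store the DB number
             * in the first two bytes of the stored argument. */
            memcpy(arg_out, &number_of_db_regex, sizeof(number_of_db_regex));

            /* tfw_match_regex(): read it back to select the scan database
             * (it ends up in rex_scan_attr.database_id). */
            memcpy(&database_id, arg_out, sizeof(database_id));

            printf("scan request goes to database %u\n", database_id);
            return 0;
    }

This is also why write_regex() rejects patterns shorter than sizeof(unsigned short): the stored argument must leave room for the database number.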
if (!arg) return; diff --git a/fw/vhost.c b/fw/vhost.c index eefd2ce41..19a967283 100644 --- a/fw/vhost.c +++ b/fw/vhost.c @@ -39,6 +39,7 @@ #include "http_sess.h" #include "client.h" #include "tls_conf.h" +#include "regex/kmod/rex.h" /* * The hash table entry for mapping @sni to @vhost for SAN certificates handling. @@ -75,6 +76,10 @@ static const TfwCfgEnum tfw_match_enum[] = { { "eq", TFW_HTTP_MATCH_O_EQ }, { "prefix", TFW_HTTP_MATCH_O_PREFIX }, { "suffix", TFW_HTTP_MATCH_O_SUFFIX }, + /*regex case sensitive*/ + { "regex", TFW_HTTP_MATCH_O_REGEX }, + /*regex* case insensitive*/ + { "regex_ci", TFW_HTTP_MATCH_O_REGEX_CI }, { 0 } }; @@ -177,6 +182,14 @@ __tfw_match_prefix(tfw_match_t op, const char *cstr, size_t len, TfwStr *arg) return tfw_str_eq_cstr(arg, cstr, len, flags); } +extern int bpf_scan_bytes(const void *, __u32, struct rex_scan_attr *); + +static bool +__tfw_match_regex(tfw_match_t op, const char *cstr, size_t len, TfwStr *arg) +{ + return tfw_match_regex(op, cstr, len, arg); +} + typedef bool (*__tfw_match_fn)(tfw_match_t, const char *, size_t, TfwStr *); static const __tfw_match_fn __tfw_match_fn_tbl[] = { @@ -185,6 +198,8 @@ static const __tfw_match_fn __tfw_match_fn_tbl[] = { [TFW_HTTP_MATCH_O_EQ] = __tfw_match_eq, [TFW_HTTP_MATCH_O_PREFIX] = __tfw_match_prefix, [TFW_HTTP_MATCH_O_SUFFIX] = __tfw_match_suffix, + [TFW_HTTP_MATCH_O_REGEX] = __tfw_match_regex, + [TFW_HTTP_MATCH_O_REGEX_CI] = __tfw_match_regex, }; /* @@ -1290,8 +1305,15 @@ tfw_location_init(TfwLocation *loc, tfw_match_t op, const char *arg, + sizeof(TfwHdrModsDesc) * TFW_USRHDRS_ARRAY_SZ * 2 + sizeof(TfwHdrModsDesc *) * TFW_HTTP_HDR_RAW * 2; - if ((argmem = kmalloc(len + 1, GFP_KERNEL)) == NULL) - return -ENOMEM; + if (op != TFW_HTTP_MATCH_O_REGEX) { + if ((argmem = kmalloc(len + 1, GFP_KERNEL)) == NULL) + return -ENOMEM; + } + else {/*If it is a regex we need only number of DB*/ + if ((argmem = kmalloc(2 + 1, GFP_KERNEL)) == NULL) + return -ENOMEM; + } + if ((data = kzalloc(size, GFP_KERNEL)) == NULL) { kfree(argmem); return -ENOMEM; @@ -1325,7 +1347,27 @@ tfw_location_init(TfwLocation *loc, tfw_match_t op, const char *arg, (TfwHdrModsDesc **)(loc->mod_hdrs[TFW_VHOST_HDRMOD_RESP].hdrs + TFW_USRHDRS_ARRAY_SZ); - memcpy((void *)loc->arg, (void *)arg, len + 1); + switch (op) { + case TFW_HTTP_MATCH_O_REGEX: + write_regex(arg, TFW_REGEX_REGULAR); + /* + * Save number_of_db_regex to use it in tfw_match_regex + */ + memcpy((void *)loc->arg, (void *)&number_of_db_regex, + sizeof(number_of_db_regex)); + break; + case TFW_HTTP_MATCH_O_REGEX_CI: + write_regex(arg, TFW_REGEX_CI); + /* + * Save number_of_db_regex to use it in tfw_match_regex + */ + memcpy((void *)loc->arg, (void *)&number_of_db_regex, + sizeof(number_of_db_regex)); + break; + default: + memcpy((void *)loc->arg, (void *)arg, len + 1); + break; + } return 0; } @@ -1344,7 +1386,6 @@ tfw_location_new(TfwVhost *vhost, tfw_match_t op, const char *arg, size_t len) if (tfw_location_init(loc, op, arg, len, vhost->hdrs_pool)) return NULL; vhost->loc_sz++; - if (tfw_frang_cfg_inherit(loc->frang_cfg, vhost->loc_dflt->frang_cfg)) return NULL; @@ -2351,6 +2392,9 @@ tfw_vhost_cfgstart(void) { TfwVhost *vh_dflt; + number_of_regex = 0; + number_of_db_regex = 0; + BUG_ON(tfw_vhosts_reconfig); tfw_vhosts_reconfig = kmalloc(sizeof(TfwVhostList), GFP_KERNEL); if (!tfw_vhosts_reconfig) { diff --git a/install.txt b/install.txt new file mode 100644 index 000000000..0199af594 --- /dev/null +++ b/install.txt @@ -0,0 +1,42 @@ +Colm (Colm Programming Language) +git clone 
https://github.com/adrian-thurston/colm.git + +$ ./autogen.sh +$ ./configure +$ make +$ make install + +add LD_LIBRARY_PATH="/usr/local/lib" to /etc/environment + + +Regal +git clone https://github.com/adrian-thurston/ragel.git + +$ ./autogen.sh +$ ./configure --with-colm=/usr/local +$ make +$ make install + + +PCRE +download PCRE from sourceforge +wget https://sourceforge.net/projects/pcre/files/pcre/8.45/pcre-8.45.tar.gz +tar -xf archive.tar.gz + +$ ./configure --enable-pcre16 --enable-pcre32 +$ make +$ make install + + + +git clone https://github.com/tempesta-tech/linux-regex-module.git + +cmake ./ +make + +after compilation +copy hscollider from /linux-regex-module/bin/ to /tempesta/scripts/ or default app directory??? + +git clone https://github.com/tempesta-tech/tempesta.git +cd tempesta +git checkout ag_Multi-pattern-regular-expressions diff --git a/regex/Makefile b/regex/Makefile new file mode 100644 index 000000000..3d5041566 --- /dev/null +++ b/regex/Makefile @@ -0,0 +1,82 @@ +obj-m := xdp_rex.o + +CC_FLAGS_HSRUNTIME := -isystem $(shell $(CC) -print-file-name=include) +CC_FLAGS_REMOVE_SIMD := -mno-80387 -mno-fp-ret-in-387 -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx +CC_FLAGS_HSRUNTIME += -DHAVE_SSE2 +CC_FLAGS_SIMD := -msse4.2 -msse4.1 +CC_FLAGS_HSRUNTIME += -DHAVE_SSE41 -DHAVE_SSE42 +CC_FLAGS_SIMD += -mavx -mavx2 +CC_FLAGS_HSRUNTIME += -DHAVE_AVX -DHAVE_AVX2 +#CC_FLAGS_SIMD += -mavx512f -mavx512cd -mavx512bw -mavx512vl -mavx512vnni +#CC_FLAGS_HSRUNTIME += -DHAVE_AVX512 +#CC_FLAGS_SIMD += -mavx512vbmi -mavx512vbmi2 -mavx512vnni +#CC_FLAGS_HSRUNTIME += -DHAVE_AVX512VBMI + +CC_FLAGS_HSRUNTIME += $(CC_FLAGS_SIMD) +CC_FLAGS_REMOVE_HSRUNTIME := $(CC_FLAGS_REMOVE_SIMD) +CC_FLAGS_REMOVE_HSRUNTIME += -Wdeclaration-after-statement +CC_FLAGS_HSRUNTIME += -Wframe-larger-than=2048 +CC_FLAGS_HSRUNTIME += -std=gnu11 +CC_FLAGS_REMOVE_HSRUNTIME += -std=gnu99 + +ccflags-y += -std=c99 +ccflags-y += -I$(src) -I$(src)/kmod -I$(src)/../ +ccflags-y += $(CC_FLAGS_HSRUNTIME) +ccflags-remove-y += $(CC_FLAGS_REMOVE_HSRUNTIME) + +CFLAGS_kmod/rex.o := $(CC_FLAGS_REMOVE_HSRUNTIME) +CFLAGS_REMOVE_kmod/rex.o := $(CC_FLAGS_HSRUNTIME) +CFLAGS_alloc.o := $(CC_FLAGS_REMOVE_SIMD) +CFLAGS_REMOVE_alloc.o := $(CC_FLAGS_SIMD) +CFLAGS_scratch.o := $(CC_FLAGS_REMOVE_SIMD) +CFLAGS_REMOVE_scratch.o := $(CC_FLAGS_SIMD) +CFLAGS_database.o := $(CC_FLAGS_REMOVE_SIMD) +CFLAGS_REMOVE_database.o := $(CC_FLAGS_SIMD) + +xdp_rex-m := kmod/rex.o \ + alloc.o \ + scratch.o \ + runtime.o \ + database.o \ + hs_version.o \ + stream_compress.o \ + fdr/fdr.o \ + fdr/teddy_avx2.o \ + fdr/teddy.o \ + hwlm/hwlm.o \ + hwlm/noodle_engine.o \ + nfa/accel.o \ + nfa/castle.o \ + nfa/gough.o \ + nfa/lbr.o \ + nfa/limex_64.o \ + nfa/limex_accel.o \ + nfa/limex_native.o \ + nfa/limex_simd128.o \ + nfa/limex_simd256.o \ + nfa/limex_simd384.o \ + nfa/limex_simd512.o \ + nfa/mcclellan.o \ + nfa/mcsheng.o \ + nfa/mcsheng_data.o \ + nfa/mpv.o \ + nfa/nfa_api_dispatch.o \ + nfa/repeat.o \ + nfa/sheng.o \ + nfa/shufti.o \ + nfa/tamarama.o \ + nfa/truffle.o \ + rose/block.o \ + rose/catchup.o \ + rose/init.o \ + rose/match.o \ + rose/program_runtime.o \ + rose/stream.o \ + som/som_runtime.o \ + som/som_stream.o \ + util/cpuid_flags.o \ + util/masked_move.o \ + util/multibit.o \ + util/simd_utils.o \ + util/state_compress.o \ +# diff --git a/regex/alloc.c b/regex/alloc.c new file mode 100644 index 000000000..27c9111fb --- /dev/null +++ b/regex/alloc.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in 
source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime functions for setting custom allocators. + */ + +#ifndef __KERNEL__ +#include +#include +#else +#include +#include +#include +#endif + +#include "allocator.h" + +#if !defined(__KERNEL__) + +#define default_malloc malloc +#define default_free free + +#else + +static void *default_malloc(size_t size) { + WARN_ON_ONCE(in_serving_softirq()); + return kmalloc(size, GFP_KERNEL); +} + +static void default_free(void *ptr) { + WARN_ON_ONCE(in_serving_softirq()); + return kfree(ptr); +} + +#endif + +hs_alloc_t hs_database_alloc = default_malloc; +hs_alloc_t hs_misc_alloc = default_malloc; +hs_alloc_t hs_scratch_alloc = default_malloc; +hs_alloc_t hs_stream_alloc = default_malloc; + +hs_free_t hs_database_free = default_free; +hs_free_t hs_misc_free = default_free; +hs_free_t hs_scratch_free = default_free; +hs_free_t hs_stream_free = default_free; + +static +hs_alloc_t normalise_alloc(hs_alloc_t a) { + if (!a) { + return default_malloc; + } else { + return a; + } +} + +static +hs_free_t normalise_free(hs_free_t f) { + if (!f) { + return default_free; + } else { + return f; + } +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_set_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) { + hs_set_database_allocator(allocfunc, freefunc); + hs_set_misc_allocator(allocfunc, freefunc); + hs_set_stream_allocator(allocfunc, freefunc); + hs_set_scratch_allocator(allocfunc, freefunc); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_set_database_allocator(hs_alloc_t allocfunc, + hs_free_t freefunc) { + hs_database_alloc = normalise_alloc(allocfunc); + hs_database_free = normalise_free(freefunc); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_set_misc_allocator(hs_alloc_t allocfunc, + hs_free_t freefunc) { + hs_misc_alloc = normalise_alloc(allocfunc); + hs_misc_free = normalise_free(freefunc); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_set_scratch_allocator(hs_alloc_t allocfunc, + hs_free_t freefunc) { + hs_scratch_alloc = normalise_alloc(allocfunc); + hs_scratch_free = 
normalise_free(freefunc); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_set_stream_allocator(hs_alloc_t allocfunc, + hs_free_t freefunc) { + hs_stream_alloc = normalise_alloc(allocfunc); + hs_stream_free = normalise_free(freefunc); + + return HS_SUCCESS; +} diff --git a/regex/allocator.h b/regex/allocator.h new file mode 100644 index 000000000..61c20f914 --- /dev/null +++ b/regex/allocator.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ALLOCATOR_H +#define ALLOCATOR_H + +#include "hs_common.h" +#include "ue2common.h" + +#ifdef __cplusplus +extern "C" +{ +#endif +extern hs_alloc_t hs_database_alloc; +extern hs_alloc_t hs_misc_alloc; +extern hs_alloc_t hs_scratch_alloc; +extern hs_alloc_t hs_stream_alloc; + +extern hs_free_t hs_database_free; +extern hs_free_t hs_misc_free; +extern hs_free_t hs_scratch_free; +extern hs_free_t hs_stream_free; +#ifdef __cplusplus +} /* extern C */ +#endif +/** \brief Check the results of an alloc done with hs_alloc for alignment. + * + * If we have incorrect alignment, return an error. Caller should free the + * offending block. */ +static really_inline +hs_error_t hs_check_alloc(const void *mem) { + hs_error_t ret = HS_SUCCESS; + if (!mem) { + ret = HS_NOMEM; + } else if (!ISALIGNED_N(mem, alignof(unsigned long long))) { + ret = HS_BAD_ALLOC; + } + return ret; +} + +#endif diff --git a/regex/build.sh b/regex/build.sh new file mode 100755 index 000000000..48e65a18d --- /dev/null +++ b/regex/build.sh @@ -0,0 +1,17 @@ +#!/bin/bash -xe + +kernel_source_dir="$1" +linux_image=/boot/"vmlinuz-${kernelver}" +shift 1 + +# Make our own source tree and extract vmlinux into it. 
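Returning to the allocator hooks defined in regex/alloc.c above: in kernel builds the defaults map Hyperscan allocations to kmalloc(GFP_KERNEL)/kfree and warn if invoked from softirq context, so databases and scratch space are expected to be allocated at configuration time rather than on the packet path. A minimal sketch of overriding them through the hs_set_allocator() defined there (vmalloc-backed, as an assumption for large databases):

    static void *rex_alloc(size_t size) { return vmalloc(size); }
    static void rex_free(void *ptr) { vfree(ptr); }

    /* during module or configuration init */
    hs_set_allocator(rex_alloc, rex_free);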
+subdirs=$(ls -A "${kernel_source_dir}"/) +mkdir -p linux +for d in $subdirs; do + ln -s "${kernel_source_dir}"/"$d" linux/"$d" +done + +linux/scripts/extract-vmlinux "${linux_image}" \ + > linux/vmlinux + +exec make -C linux "$@" diff --git a/regex/crc32.h b/regex/crc32.h new file mode 100644 index 000000000..f9c960c10 --- /dev/null +++ b/regex/crc32.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CRC32_H_36A5015B5840C1 +#define CRC32_H_36A5015B5840C1 + +#include "ue2common.h" + +#ifndef __KERNEL__ + +#ifdef __cplusplus +extern "C" +{ +#endif + +u32 Crc32c_ComputeBuf(u32 inCrc32, const void *buf, size_t bufLen); + +#ifdef __cplusplus +} +#endif + +#else /* __KERNEL */ + +#include + +static inline u32 Crc32c_ComputeBuf(u32 inCrc32, const void *buf, size_t bufLen) { + return __crc32c_le(inCrc32, (unsigned char const*) buf, bufLen); +} + +#endif + +#endif /* CRC32_H_36A5015B5840C1 */ + diff --git a/regex/database.c b/regex/database.c new file mode 100644 index 000000000..09d49b0be --- /dev/null +++ b/regex/database.c @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime code for hs_database manipulation. + */ + +#ifndef __KERNEL__ +#include +#include +#else +#include +#include +#endif + +#include "allocator.h" +#include "hs_common.h" +#include "hs_internal.h" +#include "hs_version.h" +#include "ue2common.h" +#include "database.h" +#include "crc32.h" +#include "rose/rose_internal.h" +#include "util/unaligned.h" + +static really_inline +int db_correctly_aligned(const void *db) { + return ISALIGNED_N(db, alignof(unsigned long long)); +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_free_database(hs_database_t *db) { + if (db && db->magic != HS_DB_MAGIC) { + return HS_INVALID; + } + hs_database_free(db); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_serialize_database(const hs_database_t *db, char **bytes, + size_t *serialized_length) { + if (!db || !bytes || (!serialized_length && !*bytes)) { + return HS_INVALID; + } + + if (!db_correctly_aligned(db)) { + return HS_BAD_ALIGN; + } + + hs_error_t ret = validDatabase(db); + if (ret != HS_SUCCESS) { + return ret; + } + + size_t length = sizeof(struct hs_database) + db->length; + char *out; + + if (serialized_length) { + out = hs_misc_alloc(length); + ret = hs_check_alloc(out); + if (ret != HS_SUCCESS) { + hs_misc_free(out); + return ret; + } + } else { + out = *bytes; + } + + memset(out, 0, length); + + u32 *buf = (u32 *)out; + *buf = db->magic; + buf++; + *buf = db->version; + buf++; + *buf = db->length; + buf++; + memcpy(buf, &db->platform, sizeof(u64a)); + buf += 2; + *buf = db->crc32; + buf++; + *buf = db->reserved0; + buf++; + *buf = db->reserved1; + buf++; + + const char *bytecode = hs_get_bytecode(db); + memcpy(buf, bytecode, db->length); + + if (serialized_length) { + *bytes = out; + *serialized_length = length; + } + return HS_SUCCESS; +} + +// check that the database header's platform is compatible with the current +// runtime platform. +static +hs_error_t db_check_platform(const u64a p) { + if (p != hs_current_platform + && p != (hs_current_platform | hs_current_platform_no_avx2) + && p != (hs_current_platform | hs_current_platform_no_avx512) + && p != (hs_current_platform | hs_current_platform_no_avx512vbmi)) { + return HS_DB_PLATFORM_ERROR; + } + // passed all checks + return HS_SUCCESS; +} + +// Decode and check the database header, returning appropriate errors or +// HS_SUCCESS if it's OK. The header should be allocated on the stack +// and later copied into the deserialized database. 
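+/*
+ * Layout note (derived from hs_serialize_database() above, not from any
+ * external documentation): the serialized stream begins with these header
+ * fields, read back below with unaligned loads:
+ *
+ *	u32 magic; u32 version; u32 length;
+ *	u64a platform; u32 crc32; u32 reserved0; u32 reserved1;
+ *
+ * followed immediately by the bytecode, which is what the CRC32 covers.
+ */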
+static +hs_error_t db_decode_header(const char **bytes, const size_t length, + struct hs_database *header) { + if (!*bytes) { + return HS_INVALID; + } + + if (length < sizeof(struct hs_database)) { + return HS_INVALID; + } + + // There's no requirement, really, that the serialized stream of bytes + // we've been given is 4-byte aligned, so we use unaligned loads here. + + const u32 *buf = (const u32 *)*bytes; + + // Zero header so that none of it (e.g. its padding) is uninitialized. + memset(header, 0, sizeof(struct hs_database)); + + header->magic = unaligned_load_u32(buf++); + if (header->magic != HS_DB_MAGIC) { + return HS_INVALID; + } + + header->version = unaligned_load_u32(buf++); + if (header->version != HS_DB_VERSION) { + return HS_DB_VERSION_ERROR; + } + + header->length = unaligned_load_u32(buf++); + if (length != sizeof(struct hs_database) + header->length) { + DEBUG_PRINTF("bad length %zu, expecting %zu\n", length, + sizeof(struct hs_database) + header->length); + return HS_INVALID; + } + + header->platform = unaligned_load_u64a(buf); + buf += 2; + header->crc32 = unaligned_load_u32(buf++); + header->reserved0 = unaligned_load_u32(buf++); + header->reserved1 = unaligned_load_u32(buf++); + + *bytes = (const char *)buf; + + return HS_SUCCESS; // Header checks out +} + +// Check the CRC on a database +static +hs_error_t db_check_crc(const hs_database_t *db) { + const char *bytecode = hs_get_bytecode(db); + u32 crc = Crc32c_ComputeBuf(0, bytecode, db->length); + if (crc != db->crc32) { + DEBUG_PRINTF("crc mismatch! 0x%x != 0x%x\n", crc, db->crc32); + return HS_INVALID; + } + return HS_SUCCESS; +} + +static +void db_copy_bytecode(const char *serialized, hs_database_t *db) { + // we need to align things manually + uintptr_t shift = (uintptr_t)db->bytes & 0x3f; + db->bytecode = offsetof(struct hs_database, bytes) - shift; + char *bytecode = (char *)db + db->bytecode; + + // Copy the bytecode into place + memcpy(bytecode, serialized, db->length); +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_deserialize_database_at(const char *bytes, + const size_t length, + hs_database_t *db) { + if (!bytes || !db) { + return HS_INVALID; + } + + // We require the user to deserialize into an 8-byte aligned region. 
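+	/*
+	 * Caller-side sketch (illustrative, not part of upstream Hyperscan):
+	 * the destination buffer is normally sized with
+	 * hs_serialized_database_size() and must be at least 8-byte aligned,
+	 * which kmalloc() provides on x86-64:
+	 *
+	 *	size_t sz;
+	 *	hs_database_t *db;
+	 *
+	 *	if (hs_serialized_database_size(bytes, length, &sz) != HS_SUCCESS)
+	 *		return -EINVAL;
+	 *	db = kmalloc(sz, GFP_KERNEL);
+	 *	if (!db)
+	 *		return -ENOMEM;
+	 *	if (hs_deserialize_database_at(bytes, length, db) != HS_SUCCESS)
+	 *		kfree(db);
+	 */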
+ if (!ISALIGNED_N(db, 8)) { + return HS_BAD_ALIGN; + } + + // Decode the header + hs_database_t header; + hs_error_t ret = db_decode_header(&bytes, length, &header); + if (ret != HS_SUCCESS) { + return ret; + } + + // Make sure the serialized database is for our platform + ret = db_check_platform(header.platform); + if (ret != HS_SUCCESS) { + return ret; + } + + // Zero new space for safety + size_t dblength = sizeof(struct hs_database) + header.length; + memset(db, 0, dblength); + + // Copy the decoded header into place + memcpy(db, &header, sizeof(header)); + + // Copy the bytecode into the correctly-aligned location, set offsets + db_copy_bytecode(bytes, db); + + if (db_check_crc(db) != HS_SUCCESS) { + return HS_INVALID; + } + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_deserialize_database(const char *bytes, + const size_t length, + hs_database_t **db) { + if (!bytes || !db) { + return HS_INVALID; + } + + *db = NULL; + + // Decode and check the header + hs_database_t header; + hs_error_t ret = db_decode_header(&bytes, length, &header); + if (ret != HS_SUCCESS) { + return ret; + } + + // Make sure the serialized database is for our platform + ret = db_check_platform(header.platform); + if (ret != HS_SUCCESS) { + return ret; + } + + // Allocate space for new database + size_t dblength = sizeof(struct hs_database) + header.length; + struct hs_database *tempdb = hs_database_alloc(dblength); + ret = hs_check_alloc(tempdb); + if (ret != HS_SUCCESS) { + hs_database_free(tempdb); + return ret; + } + + // Zero new space for safety + memset(tempdb, 0, dblength); + + // Copy the decoded header into place + memcpy(tempdb, &header, sizeof(header)); + + // Copy the bytecode into the correctly-aligned location, set offsets + db_copy_bytecode(bytes, tempdb); + + if (db_check_crc(tempdb) != HS_SUCCESS) { + hs_database_free(tempdb); + return HS_INVALID; + } + + *db = tempdb; + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_database_size(const hs_database_t *db, size_t *size) { + if (!size) { + return HS_INVALID; + } + + hs_error_t ret = validDatabase(db); + if (unlikely(ret != HS_SUCCESS)) { + return ret; + } + + *size = sizeof(struct hs_database) + db->length; + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_serialized_database_size(const char *bytes, + const size_t length, + size_t *size) { + // Decode and check the header + hs_database_t header; + hs_error_t ret = db_decode_header(&bytes, length, &header); + if (ret != HS_SUCCESS) { + return ret; + } + + if (!size) { + return HS_INVALID; + } + + *size = sizeof(struct hs_database) + header.length; + return HS_SUCCESS; +} + +hs_error_t dbIsValid(const hs_database_t *db) { + if (db->magic != HS_DB_MAGIC) { + DEBUG_PRINTF("bad magic\n"); + return HS_INVALID; + } + + if (db->version != HS_DB_VERSION) { + DEBUG_PRINTF("bad version\n"); + return HS_DB_VERSION_ERROR; + } + + if (db_check_platform(db->platform) != HS_SUCCESS) { + DEBUG_PRINTF("bad platform\n"); + return HS_DB_PLATFORM_ERROR; + } + + if (!ISALIGNED_16(hs_get_bytecode(db))) { + DEBUG_PRINTF("bad alignment\n"); + return HS_INVALID; + } + + hs_error_t rv = db_check_crc(db); + if (rv != HS_SUCCESS) { + DEBUG_PRINTF("bad crc\n"); + return rv; + } + + return HS_SUCCESS; +} + +#if defined(_WIN32) +#define SNPRINTF_COMPAT _snprintf +#else +#define SNPRINTF_COMPAT snprintf +#endif + +/** Allocate a buffer and prints the database info into it. Returns an + * appropriate error code on failure, or HS_SUCCESS on success. 
*/ +static +hs_error_t print_database_string(char **s, u32 version, const platform_t plat, + u32 raw_mode) { + assert(s); + *s = NULL; + + u8 release = (version >> 8) & 0xff; + u8 minor = (version >> 16) & 0xff; + u8 major = (version >> 24) & 0xff; + + const char *features = (plat & HS_PLATFORM_NOAVX512VBMI) + ? (plat & HS_PLATFORM_NOAVX512) + ? (plat & HS_PLATFORM_NOAVX2) ? "" : "AVX2" + : "AVX512" + : "AVX512VBMI"; + + const char *mode = NULL; + + if (raw_mode == HS_MODE_STREAM) { + mode = "STREAM"; + } else if (raw_mode == HS_MODE_VECTORED) { + mode = "VECTORED"; + } else { + assert(raw_mode == HS_MODE_BLOCK); + mode = "BLOCK"; + } + + // Initial allocation size, which should be large enough to print our info. + // If it isn't, snprintf will tell us and we can resize appropriately. + size_t len = 256; + + while (1) { + char *buf = hs_misc_alloc(len); + hs_error_t ret = hs_check_alloc(buf); + if (ret != HS_SUCCESS) { + hs_misc_free(buf); + return ret; + } + + // Note: SNPRINTF_COMPAT is a macro defined above, to cope with systems + // that don't have snprintf but have a workalike. + int p_len = SNPRINTF_COMPAT( + buf, len, "Version: %u.%u.%u Features: %s Mode: %s", + major, minor, release, features, mode); + if (p_len < 0) { + DEBUG_PRINTF("snprintf output error, returned %d\n", p_len); + hs_misc_free(buf); + break; + } else if ((size_t)p_len < len) { // output fit within buffer. + assert(buf[p_len] == '\0'); + *s = buf; + return HS_SUCCESS; + } else { // output didn't fit: resize and reallocate. + len = (size_t)p_len + 1; // must add one for null terminator. + hs_misc_free(buf); + } + } + + return HS_NOMEM; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_serialized_database_info(const char *bytes, + size_t length, char **info) { + if (!info) { + return HS_INVALID; + } + *info = NULL; + + // Decode and check the header + hs_database_t header; + hs_error_t ret = db_decode_header(&bytes, length, &header); + if (ret != HS_SUCCESS) { + return ret; + } + + u32 mode = unaligned_load_u32(bytes + offsetof(struct RoseEngine, mode)); + + return print_database_string(info, header.version, header.platform, mode); +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_database_info(const hs_database_t *db, char **info) { + if (!info) { + return HS_INVALID; + } + *info = NULL; + + if (!db || !db_correctly_aligned(db) || db->magic != HS_DB_MAGIC) { + return HS_INVALID; + } + + platform_t plat; + plat = db->platform; + + const struct RoseEngine *rose = hs_get_bytecode(db); + + return print_database_string(info, db->version, plat, rose->mode); +} diff --git a/regex/database.h b/regex/database.h new file mode 100644 index 000000000..f122f97be --- /dev/null +++ b/regex/database.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime code for hs_database manipulation. + */ + +#ifndef DATABASE_H_D467FD6F343DDE +#define DATABASE_H_D467FD6F343DDE + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "hs_compile.h" // for HS_MODE_ flags +#include "hs_version.h" +#include "ue2common.h" +#include "util/arch.h" + +#define HS_DB_VERSION HS_VERSION_32BIT +#define HS_DB_MAGIC (0xdbdbdbdbU) + +// Values in here cannot (easily) change - add new ones! + +// CPU type is the low 6 bits (we can't need more than 64, surely!) + +#define HS_PLATFORM_INTEL 1 +#define HS_PLATFORM_CPU_MASK 0x3F + +#define HS_PLATFORM_NOAVX2 (4<<13) +#define HS_PLATFORM_NOAVX512 (8<<13) +#define HS_PLATFORM_NOAVX512VBMI (0x10<<13) + +/** \brief Platform features bitmask. */ +typedef u64a platform_t; + +static UNUSED +const platform_t hs_current_platform = { +#if !defined(HAVE_AVX2) + HS_PLATFORM_NOAVX2 | +#endif +#if !defined(HAVE_AVX512) + HS_PLATFORM_NOAVX512 | +#endif +#if !defined(HAVE_AVX512VBMI) + HS_PLATFORM_NOAVX512VBMI | +#endif + 0, +}; + +static UNUSED +const platform_t hs_current_platform_no_avx2 = { + HS_PLATFORM_NOAVX2 | + HS_PLATFORM_NOAVX512 | + HS_PLATFORM_NOAVX512VBMI | + 0, +}; + +static UNUSED +const platform_t hs_current_platform_no_avx512 = { + HS_PLATFORM_NOAVX512 | + HS_PLATFORM_NOAVX512VBMI | + 0, +}; + +static UNUSED +const platform_t hs_current_platform_no_avx512vbmi = { + HS_PLATFORM_NOAVX512VBMI | + 0, +}; + +/* + * a header to enclose the actual bytecode - useful for keeping info about the + * compiled data. + */ +struct hs_database { + u32 magic; + u32 version; + u32 length; + u64a platform; + u32 crc32; + u32 reserved0; + u32 reserved1; + u32 bytecode; // offset relative to db start + u32 padding[16]; + char bytes[]; +}; + +static really_inline +const void *hs_get_bytecode(const struct hs_database *db) { + return ((const char *)db + db->bytecode); +} + +/** + * Cheap database sanity checks used in block mode scan calls and streaming + * mode open calls. 
+ */ +static really_inline +hs_error_t validDatabase(const hs_database_t *db) { + if (!db || db->magic != HS_DB_MAGIC) { + return HS_INVALID; + } + if (db->version != HS_DB_VERSION) { + return HS_DB_VERSION_ERROR; + } + + return HS_SUCCESS; +} + +hs_error_t dbIsValid(const struct hs_database *db); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* DATABASE_H_D467FD6F343DDE */ diff --git a/regex/dkms.conf b/regex/dkms.conf new file mode 100644 index 000000000..ea75d2553 --- /dev/null +++ b/regex/dkms.conf @@ -0,0 +1,8 @@ +PACKAGE_NAME="linux-rex" +PACKAGE_VERSION="0.1" +BUILD_EXCLUSIVE_ARCH=x86_64 +MAKE[0]="./build.sh ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build -j${parallel_jobs}" +CLEAN="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build clean" +AUTOINSTALL=yes +BUILT_MODULE_NAME[0]="xdp_rex" +DEST_MODULE_LOCATION[0]=/extra diff --git a/regex/fdr/fdr.c b/regex/fdr/fdr.c new file mode 100644 index 000000000..d33756d35 --- /dev/null +++ b/regex/fdr/fdr.c @@ -0,0 +1,881 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "fdr.h" +#include "fdr_confirm.h" +#include "fdr_confirm_runtime.h" +#include "fdr_internal.h" +#include "fdr_loadval.h" +#include "flood_runtime.h" +#include "scratch.h" +#include "teddy.h" +#include "teddy_internal.h" +#include "util/arch.h" +#include "util/simd_utils.h" +#include "util/uniform_ops.h" + +/** \brief number of bytes processed in each iteration */ +#define ITER_BYTES 16 + +/** \brief total zone buffer size */ +#define ZONE_TOTAL_SIZE 64 + +/** \brief maximum number of allowed zones */ +#define ZONE_MAX 3 + +/** \brief zone information. + * + * Zone represents a region of data to scan in FDR. + * + * The incoming buffer is to split in multiple zones to ensure two properties: + * 1: that we can read 8? 
bytes behind to generate a hash safely + * 2: that we can read the 3 byte after the current byte (domain > 8) + */ +struct zone { + /** \brief copied buffer, used only when it is a boundary zone. */ + u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE]; + + /** \brief shift amount for fdr state to avoid unwanted match. */ + u8 shift; + + /** \brief if boundary zone, start points into the zone buffer after the + * pre-padding. Otherwise, points to the main buffer, appropriately. */ + const u8 *start; + + /** \brief if boundary zone, end points to the end of zone. Otherwise, + * pointer to the main buffer, appropriately. */ + const u8 *end; + + /** \brief the amount to adjust to go from a pointer in the zones region + * (between start and end) to a pointer in the original data buffer. */ + ptrdiff_t zone_pointer_adjust; + + /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones, + * otherwise end of the zone buf. floodPtr always points inside the same + * buffer as the start pointe. */ + const u8 *floodPtr; +}; + +static +const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } +}; + +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn(const u32 a, const u8 *b) { + u64a r; +#if defined(HAVE_BMI) && !defined(NO_ASM) + __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b)); +#else + r = unaligned_load_u32(b) & ~a; +#endif + return r; +} + +/* generates an initial state mask based on the last byte-ish of history rather + * than being all accepting. If there is no history to consider, the state is + * generated based on the minimum length of each bucket in order to prevent + * confirms. 
+ */ +static really_inline +m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft, + const struct zone *z) { + m128 s; + if (len_history) { + /* +1: the zones ensure that we can read the byte at z->end */ + u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1); + tmp &= fdr->domainMask; + s = load_m128_from_u64a(ft + tmp); + s = rshiftbyte_m128(s, 1); + } else { + s = fdr->start; + } + return s; +} + +static really_inline +void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, + UNUSED const u8 *end_ptr, u32 domain_mask_flipped, + const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { + /* +1: the zones ensure that we can read the byte at z->end */ + assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); + u64a reach1 = andn(domain_mask_flipped, itPtr + 1); + u64a reach2 = andn(domain_mask_flipped, itPtr + 2); + u64a reach3 = andn(domain_mask_flipped, itPtr + 3); + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st1 = load_m128_from_u64a(ft + reach1); + m128 st2 = load_m128_from_u64a(ft + reach2); + m128 st3 = load_m128_from_u64a(ft + reach3); + + u64a reach4 = andn(domain_mask_flipped, itPtr + 4); + u64a reach5 = andn(domain_mask_flipped, itPtr + 5); + u64a reach6 = andn(domain_mask_flipped, itPtr + 6); + u64a reach7 = andn(domain_mask_flipped, itPtr + 7); + + m128 st4 = load_m128_from_u64a(ft + reach4); + m128 st5 = load_m128_from_u64a(ft + reach5); + m128 st6 = load_m128_from_u64a(ft + reach6); + m128 st7 = load_m128_from_u64a(ft + reach7); + + st1 = lshiftbyte_m128(st1, 1); + st2 = lshiftbyte_m128(st2, 2); + st3 = lshiftbyte_m128(st3, 3); + st4 = lshiftbyte_m128(st4, 4); + st5 = lshiftbyte_m128(st5, 5); + st6 = lshiftbyte_m128(st6, 6); + st7 = lshiftbyte_m128(st7, 7); + + st0 = or128(st0, st1); + st2 = or128(st2, st3); + st4 = or128(st4, st5); + st6 = or128(st6, st7); + st0 = or128(st0, st2); + st4 = or128(st4, st6); + st0 = or128(st0, st4); + *s = or128(*s, st0); + + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 ^= ~0ULL; + + u64a reach8 = andn(domain_mask_flipped, itPtr + 8); + u64a reach9 = andn(domain_mask_flipped, itPtr + 9); + u64a reach10 = andn(domain_mask_flipped, itPtr + 10); + u64a reach11 = andn(domain_mask_flipped, itPtr + 11); + + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st9 = load_m128_from_u64a(ft + reach9); + m128 st10 = load_m128_from_u64a(ft + reach10); + m128 st11 = load_m128_from_u64a(ft + reach11); + + u64a reach12 = andn(domain_mask_flipped, itPtr + 12); + u64a reach13 = andn(domain_mask_flipped, itPtr + 13); + u64a reach14 = andn(domain_mask_flipped, itPtr + 14); + u64a reach15 = andn(domain_mask_flipped, itPtr + 15); + + m128 st12 = load_m128_from_u64a(ft + reach12); + m128 st13 = load_m128_from_u64a(ft + reach13); + m128 st14 = load_m128_from_u64a(ft + reach14); + m128 st15 = load_m128_from_u64a(ft + reach15); + + st9 = lshiftbyte_m128(st9, 1); + st10 = lshiftbyte_m128(st10, 2); + st11 = lshiftbyte_m128(st11, 3); + st12 = lshiftbyte_m128(st12, 4); + st13 = lshiftbyte_m128(st13, 5); + st14 = lshiftbyte_m128(st14, 6); + st15 = lshiftbyte_m128(st15, 7); + + st8 = or128(st8, st9); + st10 = or128(st10, st11); + st12 = or128(st12, st13); + st14 = or128(st14, st15); + st8 = or128(st8, st10); + st12 = or128(st12, st14); + st8 = or128(st8, st12); + *s = or128(*s, st8); + + *conf8 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf8 ^= ~0ULL; +} + +static really_inline +void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, + UNUSED 
const u8 *end_ptr, u32 domain_mask_flipped, + const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { + assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); + u64a reach2 = andn(domain_mask_flipped, itPtr + 2); + u64a reach4 = andn(domain_mask_flipped, itPtr + 4); + u64a reach6 = andn(domain_mask_flipped, itPtr + 6); + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st2 = load_m128_from_u64a(ft + reach2); + m128 st4 = load_m128_from_u64a(ft + reach4); + m128 st6 = load_m128_from_u64a(ft + reach6); + + u64a reach8 = andn(domain_mask_flipped, itPtr + 8); + u64a reach10 = andn(domain_mask_flipped, itPtr + 10); + u64a reach12 = andn(domain_mask_flipped, itPtr + 12); + u64a reach14 = andn(domain_mask_flipped, itPtr + 14); + + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st10 = load_m128_from_u64a(ft + reach10); + m128 st12 = load_m128_from_u64a(ft + reach12); + m128 st14 = load_m128_from_u64a(ft + reach14); + + st2 = lshiftbyte_m128(st2, 2); + st4 = lshiftbyte_m128(st4, 4); + st6 = lshiftbyte_m128(st6, 6); + + *s = or128(*s, st0); + *s = or128(*s, st2); + *s = or128(*s, st4); + *s = or128(*s, st6); + + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 ^= ~0ULL; + + st10 = lshiftbyte_m128(st10, 2); + st12 = lshiftbyte_m128(st12, 4); + st14 = lshiftbyte_m128(st14, 6); + + *s = or128(*s, st8); + *s = or128(*s, st10); + *s = or128(*s, st12); + *s = or128(*s, st14); + + *conf8 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf8 ^= ~0ULL; +} + +static really_inline +void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr, + UNUSED const u8 *end_ptr, u32 domain_mask_flipped, + const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { + assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); + u64a reach4 = andn(domain_mask_flipped, itPtr + 4); + u64a reach8 = andn(domain_mask_flipped, itPtr + 8); + u64a reach12 = andn(domain_mask_flipped, itPtr + 12); + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st4 = load_m128_from_u64a(ft + reach4); + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st12 = load_m128_from_u64a(ft + reach12); + + st4 = lshiftbyte_m128(st4, 4); + st12 = lshiftbyte_m128(st12, 4); + + *s = or128(*s, st0); + *s = or128(*s, st4); + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 ^= ~0ULL; + + *s = or128(*s, st8); + *s = or128(*s, st12); + *conf8 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf8 ^= ~0ULL; +} + +static really_inline +void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, + const u32 *confBase, const struct FDR_Runtime_Args *a, + const u8 *ptr, u32 *last_match_id, struct zone *z) { + const u8 bucket = 8; + + if (likely(!*conf)) { + return; + } + + /* ptr is currently referring to a location in the zone's buffer, we also + * need a pointer in the original, main buffer for the final string compare. 
+ */ + const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); + + const u8 *confLoc = ptr; + + do { + u32 bit = findAndClearLSB_64(conf); + u32 byte = bit / bucket + offset; + u32 bitRem = bit % bucket; + u32 idx = bitRem; + u32 cf = confBase[idx]; + if (!cf) { + continue; + } + const struct FDRConfirm *fdrc = (const struct FDRConfirm *) + ((const u8 *)confBase + cf); + if (!(fdrc->groups & *control)) { + continue; + } + u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1); + confWithBit(fdrc, a, ptr_main - a->buf + byte, control, + last_match_id, confVal, conf, bit); + } while (unlikely(!!*conf)); +} + +static really_inline +void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) { +#ifdef DEBUG + DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf); + DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n", + z->start, z->end, z->shift); + DEBUG_PRINTF("zone: zone_pointer_adjust=%zd, floodPtr=%p\n", + z->zone_pointer_adjust, z->floodPtr); + DEBUG_PRINTF("zone buf:"); + for (size_t i = 0; i < ZONE_TOTAL_SIZE; i++) { + if (i % 8 == 0) { + printf("_"); + } + if (z->buf[i]) { + printf("%02x", z->buf[i]); + } else { + printf(".."); + } + } + printf("\n"); +#endif +}; + +/** + * \brief Updates attributes for non-boundary region zone. + */ +static really_inline +void createMainZone(const u8 *flood, const u8 *begin, const u8 *end, + struct zone *z) { + z->zone_pointer_adjust = 0; /* zone buffer is the main buffer */ + z->start = begin; + z->end = end; + z->floodPtr = flood; + z->shift = 0; +} + +/** + * \brief Create zone for short cases (<= ITER_BYTES). + * + * For this case we need to copy everything into the zone's internal buffer. + * + * We need to ensure that we run over real data if it exists (in history or + * before zone begin). We also need to ensure 8 bytes before any data being + * matched can be read (to perform a conf hash). + * + * We also need to ensure that the data at z->end can be read. + * + * Hence, the zone consists of: + * 16 bytes of history, + * 1 - 24 bytes of data form the buffer (ending at end), + * 1 byte of final padding + */ +static really_inline +void createShortZone(const u8 *buf, const u8 *hend, const u8 *begin, + const u8 *end, struct zone *z) { + /* the floodPtr for BOUNDARY zones are maximum of end of zone buf to avoid + * the checks in boundary zone. */ + z->floodPtr = z->buf + ZONE_TOTAL_SIZE; + + ptrdiff_t z_len = end - begin; + assert(z_len > 0); + assert(z_len <= ITER_BYTES); + + z->shift = ITER_BYTES - z_len; /* ignore bytes outside region specified */ + + static const size_t ZONE_SHORT_DATA_OFFSET = 16; /* after history */ + + /* we are guaranteed to always have 16 initialised bytes at the end of + * the history buffer (they may be garbage coming from the stream state + * preceding hbuf, but bytes that don't correspond to actual history + * shouldn't affect computations). */ + *(m128 *)z->buf = loadu128(hend - sizeof(m128)); + + /* The amount of data we have to copy from main buffer. 
*/ + size_t copy_len = MIN((size_t)(end - buf), + ITER_BYTES + sizeof(CONF_TYPE)); + + u8 *zone_data = z->buf + ZONE_SHORT_DATA_OFFSET; + switch (copy_len) { + case 1: + *zone_data = *(end - 1); + break; + case 2: + *(u16 *)zone_data = unaligned_load_u16(end - 2); + break; + case 3: + *(u16 *)zone_data = unaligned_load_u16(end - 3); + *(zone_data + 2) = *(end - 1); + break; + case 4: + *(u32 *)zone_data = unaligned_load_u32(end - 4); + break; + case 5: + case 6: + case 7: + /* perform copy with 2 overlapping 4-byte chunks from buf. */ + *(u32 *)zone_data = unaligned_load_u32(end - copy_len); + unaligned_store_u32(zone_data + copy_len - sizeof(u32), + unaligned_load_u32(end - sizeof(u32))); + break; + case 8: + *(u64a *)zone_data = unaligned_load_u64a(end - 8); + break; + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + /* perform copy with 2 overlapping 8-byte chunks from buf. */ + *(u64a *)zone_data = unaligned_load_u64a(end - copy_len); + unaligned_store_u64a(zone_data + copy_len - sizeof(u64a), + unaligned_load_u64a(end - sizeof(u64a))); + break; + case 16: + /* copy 16-bytes from buf. */ + *(m128 *)zone_data = loadu128(end - 16); + break; + default: + assert(copy_len <= sizeof(m128) + sizeof(u64a)); + + /* perform copy with (potentially overlapping) 8-byte and 16-byte chunks. + */ + *(u64a *)zone_data = unaligned_load_u64a(end - copy_len); + storeu128(zone_data + copy_len - sizeof(m128), + loadu128(end - sizeof(m128))); + break; + } + + /* set the start and end location of the zone buf + * to be scanned */ + u8 *z_end = z->buf + ZONE_SHORT_DATA_OFFSET + copy_len; + assert(ZONE_SHORT_DATA_OFFSET + copy_len >= ITER_BYTES); + + /* copy the post-padding byte; this is required for domain > 8 due to + * overhang */ + assert(ZONE_SHORT_DATA_OFFSET + copy_len + 3 < 64); + *z_end = 0; + + z->end = z_end; + z->start = z_end - ITER_BYTES; + z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); + assert(z->start + z->shift == z_end - z_len); +} + +/** + * \brief Create a zone for the start region. + * + * This function requires that there is > ITER_BYTES of data in the buffer to + * scan. The start zone itself is always responsible for scanning exactly + * ITER_BYTES of data - there are no warmup/junk bytes scanned. + * + * This zone ensures that the byte at z->end can be read and corresponds to + * the next byte of data. + * + * 8 bytes of history data are provided before z->start to allow proper hash + * generation in streaming mode. If buf != begin, upto 8 bytes of data + * prior to begin is also provided. + * + * Although we are not interested in bare literals which start before begin + * if buf != begin, lookarounds associated with the literal may require + * the data prior to begin for hash purposes. + */ +static really_inline +void createStartZone(const u8 *buf, const u8 *hend, const u8 *begin, + struct zone *z) { + assert(ITER_BYTES == sizeof(m128)); + assert(sizeof(CONF_TYPE) == 8); + static const size_t ZONE_START_BEGIN = sizeof(CONF_TYPE); + + const u8 *end = begin + ITER_BYTES; + + /* set floodPtr to the end of zone buf to avoid checks in start zone */ + z->floodPtr = z->buf + ZONE_TOTAL_SIZE; + + z->shift = 0; /* we are processing ITER_BYTES of real data */ + + /* we are guaranteed to always have 16 initialised bytes at the end of the + * history buffer (they may be garbage coming from the stream state + * preceding hbuf, but bytes that don't correspond to actual history + * shouldn't affect computations). 
However, for start zones, history is only + * required for conf hash purposes so we only need 8 bytes */ + unaligned_store_u64a(z->buf, unaligned_load_u64a(hend - sizeof(u64a))); + + /* The amount of data we have to copy from main buffer. */ + size_t copy_len = MIN((size_t)(end - buf), + ITER_BYTES + sizeof(CONF_TYPE)); + assert(copy_len >= 16); + + /* copy the post-padding byte; this is required for domain > 8 due to + * overhang. The start requires that there is data after the zone so it + * it safe to dereference end */ + z->buf[ZONE_START_BEGIN + copy_len] = *end; + + /* set the start and end location of the zone buf to be scanned */ + u8 *z_end = z->buf + ZONE_START_BEGIN + copy_len; + z->end = z_end; + z->start = z_end - ITER_BYTES; + + /* copy the first 8 bytes of the valid region */ + unaligned_store_u64a(z->buf + ZONE_START_BEGIN, + unaligned_load_u64a(end - copy_len)); + + /* copy the last 16 bytes, may overlap with the previous 8 byte write */ + storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); + + z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); + + assert(ZONE_START_BEGIN + copy_len + 3 < 64); +} + +/** + * \brief Create a zone for the end region. + * + * This function requires that there is > ITER_BYTES of data in the buffer to + * scan. The end zone is responsible for a scanning the <= ITER_BYTES rump of + * data and optional ITER_BYTES. The main zone cannot handle the last 3 bytes + * of the buffer. The end zone is required to handle an optional full + * ITER_BYTES from main zone when there are less than 3 bytes to scan. The + * main zone size is reduced by ITER_BYTES in this case. + * + * This zone ensures that the byte at z->end can be read by filling it with a + * padding character. + * + * Upto 8 bytes of data prior to begin is also provided for the purposes of + * generating hashes. History is not copied, as all locations which require + * history for generating a hash are the responsiblity of the start zone. + */ +static really_inline +void createEndZone(const u8 *buf, const u8 *begin, const u8 *end, + struct zone *z) { + /* the floodPtr for BOUNDARY zones are maximum of end of zone buf to avoid + * the checks in boundary zone. */ + z->floodPtr = z->buf + ZONE_TOTAL_SIZE; + + ptrdiff_t z_len = end - begin; + assert(z_len > 0); + size_t iter_bytes_second = 0; + size_t z_len_first = z_len; + if (z_len > ITER_BYTES) { + z_len_first = z_len - ITER_BYTES; + iter_bytes_second = ITER_BYTES; + } + z->shift = ITER_BYTES - z_len_first; + + const u8 *end_first = end - iter_bytes_second; + /* The amount of data we have to copy from main buffer for the + * first iteration. 
*/ + size_t copy_len_first = MIN((size_t)(end_first - buf), + ITER_BYTES + sizeof(CONF_TYPE)); + assert(copy_len_first >= 16); + + size_t total_copy_len = copy_len_first + iter_bytes_second; + assert(total_copy_len + 3 < 64); + + /* copy the post-padding byte; this is required for domain > 8 due to + * overhang */ + z->buf[total_copy_len] = 0; + + /* set the start and end location of the zone buf + * to be scanned */ + u8 *z_end = z->buf + total_copy_len; + z->end = z_end; + z->start = z_end - ITER_BYTES - iter_bytes_second; + assert(z->start + z->shift == z_end - z_len); + + u8 *z_end_first = z_end - iter_bytes_second; + /* copy the first 8 bytes of the valid region */ + unaligned_store_u64a(z->buf, + unaligned_load_u64a(end_first - copy_len_first)); + + /* copy the last 16 bytes, may overlap with the previous 8 byte write */ + storeu128(z_end_first - sizeof(m128), loadu128(end_first - sizeof(m128))); + if (iter_bytes_second) { + storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); + } + + z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); +} + +/** + * \brief Prepare zones. + * + * This function prepares zones with actual buffer and some padded bytes. + * The actual ITER_BYTES bytes in zone is preceded by main buf and/or + * history buf and succeeded by padded bytes possibly from main buf, + * if available. + */ +static really_inline +size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, + size_t start, const u8 *flood, struct zone *zoneArr) { + const u8 *ptr = buf + start; + size_t remaining = len - start; + + if (remaining <= ITER_BYTES) { + /* enough bytes to make only one zone */ + createShortZone(buf, hend, ptr, buf + len, &zoneArr[0]); + return 1; + } + + /* enough bytes to make more than one zone */ + + size_t numZone = 0; + createStartZone(buf, hend, ptr, &zoneArr[numZone++]); + ptr += ITER_BYTES; + + assert(ptr < buf + len); + + /* find maximum buffer location that the main zone can scan + * - must be a multiple of ITER_BYTES, and + * - cannot contain the last 3 bytes (due to 3 bytes read behind the + end of buffer in FDR main loop) + */ + const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 3, ITER_BYTES); + + /* create a zone if multiple of ITER_BYTES are found */ + if (main_end > ptr) { + createMainZone(flood, ptr, main_end, &zoneArr[numZone++]); + ptr = main_end; + } + /* create a zone with rest of the data from the main buffer */ + createEndZone(buf, ptr, buf + len, &zoneArr[numZone++]); + return numZone; +} + +#define INVALID_MATCH_ID (~0U) + +#define FDR_MAIN_LOOP(zz, s, get_conf_fn) \ + do { \ + const u8 *tryFloodDetect = zz->floodPtr; \ + const u8 *start_ptr = zz->start; \ + const u8 *end_ptr = zz->end; \ + \ + for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \ + itPtr += ITER_BYTES) { \ + if (unlikely(itPtr > tryFloodDetect)) { \ + tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\ + &floodBackoff, &control, \ + ITER_BYTES); \ + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ + return HWLM_TERMINATED; \ + } \ + } \ + __builtin_prefetch(itPtr + ITER_BYTES); \ + u64a conf0; \ + u64a conf8; \ + get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \ + ft, &conf0, &conf8, &s); \ + do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \ + &last_match_id, zz); \ + do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, \ + &last_match_id, zz); \ + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ + return HWLM_TERMINATED; \ + } \ + } /* end for loop */ \ + } while (0) \ + +static 
never_inline +hwlm_error_t fdr_engine_exec(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + assert(ISALIGNED_CL(fdr)); + + u32 floodBackoff = FLOOD_BACKOFF_START; + u32 last_match_id = INVALID_MATCH_ID; + u32 domain_mask_flipped = ~fdr->domainMask; + u8 stride = fdr->stride; + const u64a *ft = + (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR))); + assert(ISALIGNED_CL(ft)); + const u32 *confBase = (const u32 *)((const u8 *)fdr + fdr->confOffset); + assert(ISALIGNED_CL(confBase)); + struct zone zones[ZONE_MAX]; + assert(fdr->domain > 8 && fdr->domain < 16); + + size_t numZone = prepareZones(a->buf, a->len, + a->buf_history + a->len_history, + a->start_offset, a->firstFloodDetect, zones); + assert(numZone <= ZONE_MAX); + m128 state = getInitState(fdr, a->len_history, ft, &zones[0]); + + for (size_t curZone = 0; curZone < numZone; curZone++) { + struct zone *z = &zones[curZone]; + dumpZoneInfo(z, curZone); + + /* When a zone contains less data than is processed in an iteration + * of FDR_MAIN_LOOP(), we need to scan over some extra data. + * + * We have chosen to scan this extra data at the start of the + * iteration. The extra data is either data we have already scanned or + * garbage (if it is earlier than offset 0), + * + * As a result we need to shift the incoming state back so that it will + * properly line up with the data being scanned. + * + * We also need to forbid reporting any matches in the data being + * rescanned as they have already been reported (or are over garbage but + * later stages should also provide that safety guarantee). + */ + + u8 shift = z->shift; + + state = variable_byte_shift_m128(state, shift); + + state = or128(state, load128(zone_or_mask[shift])); + + switch (stride) { + case 1: + FDR_MAIN_LOOP(z, state, get_conf_stride_1); + break; + case 2: + FDR_MAIN_LOOP(z, state, get_conf_stride_2); + break; + case 4: + FDR_MAIN_LOOP(z, state, get_conf_stride_4); + break; + default: + break; + } + } + + return HWLM_SUCCESS; +} + +#if defined(HAVE_AVX2) +#define ONLY_AVX2(func) func +#else +#define ONLY_AVX2(func) NULL +#endif + +typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +static const FDRFUNCTYPE funcs[] = { + fdr_engine_exec, + NULL, /* old: fast teddy */ + NULL, /* old: fast teddy */ + ONLY_AVX2(fdr_exec_fat_teddy_msks1), + ONLY_AVX2(fdr_exec_fat_teddy_msks1_pck), + ONLY_AVX2(fdr_exec_fat_teddy_msks2), + ONLY_AVX2(fdr_exec_fat_teddy_msks2_pck), + ONLY_AVX2(fdr_exec_fat_teddy_msks3), + ONLY_AVX2(fdr_exec_fat_teddy_msks3_pck), + ONLY_AVX2(fdr_exec_fat_teddy_msks4), + ONLY_AVX2(fdr_exec_fat_teddy_msks4_pck), + fdr_exec_teddy_msks1, + fdr_exec_teddy_msks1_pck, + fdr_exec_teddy_msks2, + fdr_exec_teddy_msks2_pck, + fdr_exec_teddy_msks3, + fdr_exec_teddy_msks3_pck, + fdr_exec_teddy_msks4, + fdr_exec_teddy_msks4_pck, +}; + +#define FAKE_HISTORY_SIZE 16 +static const u8 fake_history[FAKE_HISTORY_SIZE]; + +hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, + struct hs_scratch *scratch, hwlm_group_t groups) { + // We guarantee (for safezone construction) that it is safe to read 16 + // bytes before the end of the history buffer. 
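
The shift/or-mask adjustment in the zone loop of fdr_engine_exec() above is easier to see in scalar form. Below is a minimal standalone sketch (illustration only, not part of this patch; the name suppress_rescanned is invented) that models one 8-byte lane of the FDR state as a u64 in which a clear bit means "possible match candidate at this byte". OR-ing 0xff into the first `shift` bytes forces those positions to "no match", which is what or128(state, load128(zone_or_mask[shift])) achieves for the rescanned or garbage bytes at the start of a zone.

    #include <stdint.h>
    #include <stdio.h>

    /* One 64-bit lane of the state: a clear bit marks a match candidate at
     * that byte position, a set bit means "no match possible". */
    static uint64_t suppress_rescanned(uint64_t lane, unsigned shift)
    {
        /* Scalar equivalent of zone_or_mask[shift] for shift < 8:
         * 0xff in each of the first `shift` bytes, 0x00 elsewhere. */
        uint64_t mask = (shift >= 8) ? ~0ULL : ((1ULL << (8 * shift)) - 1);
        /* Force "no match" for bytes already scanned in the previous zone,
         * so they can never be reported a second time. */
        return lane | mask;
    }

    int main(void)
    {
        uint64_t lane = 0xfffffffffff0ffffULL;   /* candidate in byte 2 */
        printf("%016llx\n",
               (unsigned long long)suppress_rescanned(lane, 3));
        /* byte 2 lies within the 3 rescanned bytes, so the candidate is
         * suppressed and the result is all ones */
        return 0;
    }

Note that the real code additionally shifts the state with variable_byte_shift_m128() so it lines up with the earlier start pointer before the mask is applied; the sketch only shows the suppression step.
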
+ const u8 *hbuf = fake_history + FAKE_HISTORY_SIZE; + + const struct FDR_Runtime_Args a = { + buf, + len, + hbuf, + 0, + start, + cb, + scratch, + nextFloodDetect(buf, len, FLOOD_BACKOFF_START), + 0 + }; + if (unlikely(a.start_offset >= a.len)) { + return HWLM_SUCCESS; + } else { + assert(funcs[fdr->engineID]); + return funcs[fdr->engineID](fdr, &a, groups); + } +} + +hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, + size_t hlen, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, + struct hs_scratch *scratch, + hwlm_group_t groups) { + struct FDR_Runtime_Args a = { + buf, + len, + hbuf, + hlen, + start, + cb, + scratch, + nextFloodDetect(buf, len, FLOOD_BACKOFF_START), + /* we are guaranteed to always have 16 initialised bytes at the end of + * the history buffer (they may be garbage). */ + hbuf ? unaligned_load_u64a(hbuf + hlen - sizeof(u64a)) : (u64a)0 + }; + + hwlm_error_t ret; + if (unlikely(a.start_offset >= a.len)) { + ret = HWLM_SUCCESS; + } else { + assert(funcs[fdr->engineID]); + ret = funcs[fdr->engineID](fdr, &a, groups); + } + + return ret; +} diff --git a/regex/fdr/fdr.h b/regex/fdr/fdr.h new file mode 100644 index 000000000..4dcef851d --- /dev/null +++ b/regex/fdr/fdr.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief FDR literal matcher: runtime API. + */ + +#ifndef FDR_H +#define FDR_H + +#include "ue2common.h" +#include "hwlm/hwlm.h" + +// C linkage in the API +#ifdef __cplusplus +extern "C" { +#endif + +struct FDR; +struct hs_scratch; + +/** + * \brief Block-mode scan. + * + * \param fdr FDR matcher engine. + * \param buf Buffer to scan. + * \param len Length of buffer to scan. + * \param start First offset in buf at which a match may start. + * \param cb Callback to call when a match is found. + * \param scratch Scratch supplied to callback on match. + * \param groups Initial groups mask. 
+ */ +hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, struct hs_scratch *scratch, + hwlm_group_t groups); + +/** + * \brief Streaming-mode scan. + * + * \param fdr FDR matcher engine. + * \param hbuf History buffer. + * \param hlen Length of history buffer (hbuf). + * \param buf Buffer to scan. + * \param len Length of buffer to scan (buf). + * \param start First offset in buf at which a match may start. + * \param cb Callback to call when a match is found. + * \param scratch Scratch supplied to callback on match. + * \param groups Initial groups mask. + */ +hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, + size_t hlen, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, + struct hs_scratch *scratch, + hwlm_group_t groups); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // FDR_H diff --git a/regex/fdr/fdr_confirm.h b/regex/fdr/fdr_confirm.h new file mode 100644 index 000000000..a23082cc6 --- /dev/null +++ b/regex/fdr/fdr_confirm.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FDR_CONFIRM_H +#define FDR_CONFIRM_H + +#include "ue2common.h" +#include "hwlm/hwlm.h" + +static really_inline +u32 mul_hash_64(u64a lv, u64a andmsk, u64a mult, u32 nBits) { + return ((lv & andmsk) * mult) >> (sizeof(u64a)*8 - nBits); +} + +// data structures +// TODO: fix this hard-coding +#define CONF_TYPE u64a +#define CONF_HASH_CALL mul_hash_64 + +/** + * \brief Flag indicating this literal doesn't need to be delivered more than + * once, used in LitInfo::flags. + */ +#define FDR_LIT_FLAG_NOREPEAT 1 + +/** + * \brief Structure describing a literal, linked to by FDRConfirm. + * + * This structure is followed in memory by a variable-sized string prefix, for + * strings that are longer than CONF_TYPE. + */ +struct LitInfo { + CONF_TYPE v; + CONF_TYPE msk; + hwlm_group_t groups; + u32 id; // literal ID as passed in + u8 size; + u8 flags; //!< bitfield of flags from FDR_LIT_FLAG_* above. 
+ u8 next; +}; + +#define FDRC_FLAG_NO_CONFIRM 1 +#define FDRC_FLAG_NOREPEAT 2 + +/** + * \brief FDR confirm header. + * + * This structure is followed in memory by: + * + * -# lit index mapping (array of u32) + * -# list of LitInfo structures + */ +struct FDRConfirm { + CONF_TYPE andmsk; + CONF_TYPE mult; + u32 nBits; + hwlm_group_t groups; +}; + +static really_inline +const u32 *getConfirmLitIndex(const struct FDRConfirm *fdrc) { + const u8 *base = (const u8 *)fdrc; + const u32 *litIndex = + (const u32 *)(base + ROUNDUP_N(sizeof(*fdrc), alignof(u32))); + assert(ISALIGNED(litIndex)); + return litIndex; +} + +#endif // FDR_CONFIRM_H diff --git a/regex/fdr/fdr_confirm_runtime.h b/regex/fdr/fdr_confirm_runtime.h new file mode 100644 index 000000000..5a2164952 --- /dev/null +++ b/regex/fdr/fdr_confirm_runtime.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef FDR_CONFIRM_RUNTIME_H +#define FDR_CONFIRM_RUNTIME_H + +#include "scratch.h" +#include "fdr_internal.h" +#include "fdr_loadval.h" +#include "hwlm/hwlm.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/compare.h" + +// this is ordinary confirmation function which runs through +// the whole confirmation procedure +static really_inline +void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a, + size_t i, hwlmcb_rv_t *control, u32 *last_match, + u64a conf_key, u64a *conf, u8 bit) { + assert(i < a->len); + assert(i >= a->start_offset); + assert(ISALIGNED(fdrc)); + + const u8 * buf = a->buf; + u32 c = CONF_HASH_CALL(conf_key, fdrc->andmsk, fdrc->mult, + fdrc->nBits); + u32 start = getConfirmLitIndex(fdrc)[c]; + if (likely(!start)) { + return; + } + + const struct LitInfo *li + = (const struct LitInfo *)((const u8 *)fdrc + start); + + struct hs_scratch *scratch = a->scratch; + assert(!scratch->fdr_conf); + scratch->fdr_conf = conf; + scratch->fdr_conf_offset = bit; + u8 oldNext; // initialized in loop + do { + assert(ISALIGNED(li)); + + if (unlikely((conf_key & li->msk) != li->v)) { + goto out; + } + + if ((*last_match == li->id) && (li->flags & FDR_LIT_FLAG_NOREPEAT)) { + goto out; + } + + const u8 *loc = buf + i - li->size + 1; + + if (loc < buf) { + u32 full_overhang = buf - loc; + size_t len_history = a->len_history; + + // can't do a vectored confirm either if we don't have + // the bytes + if (full_overhang > len_history) { + goto out; + } + } + assert(li->size <= sizeof(CONF_TYPE)); + + if (unlikely(!(li->groups & *control))) { + goto out; + } + + *last_match = li->id; + *control = a->cb(i, li->id, scratch); + out: + oldNext = li->next; // oldNext is either 0 or an 'adjust' value + li++; + } while (oldNext); + scratch->fdr_conf = NULL; +} + +#endif diff --git a/regex/fdr/fdr_internal.h b/regex/fdr/fdr_internal.h new file mode 100644 index 000000000..c79f61c1f --- /dev/null +++ b/regex/fdr/fdr_internal.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief FDR literal matcher: data structures. + */ + +#ifndef FDR_INTERNAL_H +#define FDR_INTERNAL_H + +#include "ue2common.h" +#include "hwlm/hwlm.h" // for hwlm_group_t, HWLMCallback + +struct hs_scratch; + +typedef enum { + NOT_CAUTIOUS, //!< not near a boundary (quantify?) + VECTORING //!< potentially vectoring +} CautionReason; + +/** \brief number of different ids that can be triggered by floods of any given + * character. */ +#define FDR_FLOOD_MAX_IDS 16 + +struct FDRFlood { + hwlm_group_t allGroups; //!< all the groups or'd together + u32 suffix; + + /** \brief 0 to FDR_FLOOD_MAX_IDS-1 ids that are generated once per char on + * a flood. + * If larger we won't handle this through the flood path at all. */ + u16 idCount; + + u32 ids[FDR_FLOOD_MAX_IDS]; //!< the ids + hwlm_group_t groups[FDR_FLOOD_MAX_IDS]; //!< group ids to go with string ids +}; + +/** \brief FDR structure. + * + * 1. struct as-is + * 2. primary matching table + * 3. confirm stuff + */ +struct FDR { + u32 engineID; + u32 size; + u32 maxStringLen; + u32 numStrings; + u32 confOffset; + u32 floodOffset; + u8 stride; /* stride - how frequently the data is consulted by the first + * stage matcher */ + u8 domain; /* number of bits used to index into main FDR table. This value + * is used only of debugging/asserts. */ + u16 domainMask; /* pre-computed domain mask */ + u32 tabSize; /* pre-computed hashtable size in bytes */ + m128 start; /* initial start state to use at offset 0. The state has been + * set up based on the min length of buckets to reduce the need + * for pointless confirms. */ +}; + +/** \brief FDR runtime arguments. + * + * This structure handles read-only things that are passed extensively around + * the FDR run-time functions. They are set by the API, passed by value into + * the main function, then a pointer is passed around to all the various + * sub-functions (confirm & flood). */ +struct FDR_Runtime_Args { + const u8 *buf; + size_t len; + const u8 *buf_history; + size_t len_history; + size_t start_offset; + HWLMCallback cb; + struct hs_scratch *scratch; + const u8 *firstFloodDetect; + const u64a histBytes; +}; + +#endif diff --git a/regex/fdr/fdr_loadval.h b/regex/fdr/fdr_loadval.h new file mode 100644 index 000000000..86c39c7f3 --- /dev/null +++ b/regex/fdr/fdr_loadval.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FDR_LOADVAL_H +#define FDR_LOADVAL_H + +#include "ue2common.h" +#include "util/unaligned.h" + +#define MAKE_LOADVAL(type, name) \ + static really_inline \ + type name(const u8 *ptr, UNUSED const u8 *lo, UNUSED const u8 *hi) + +#define NORMAL_SAFE(type) \ + do { \ + assert(ptr >= lo); \ + assert(ptr + sizeof(type) - 1 < hi); \ + } while(0) + +#define MAKE_LOOP_CE(TYPE) \ + TYPE v = 0; \ + for (TYPE i = 0; i < sizeof(TYPE); i++) { \ + if ((lo <= ptr + i) && (ptr + i < hi)) { \ + v += (TYPE)ptr[i] << (i*8); \ + } \ + } \ + return v; + +// no suffix = normal (unaligned) +// _ce = cautious everywhere (in both directions); test against hi and lo + +MAKE_LOADVAL(u16, lv_u16) { + NORMAL_SAFE(u16); + return unaligned_load_u16(ptr); +} + +MAKE_LOADVAL(u64a, lv_u64a) { + NORMAL_SAFE(u32); + return unaligned_load_u64a(ptr); +} + +MAKE_LOADVAL(u16, lv_u16_ce) { MAKE_LOOP_CE(u16); } + +MAKE_LOADVAL(u64a, lv_u64a_ce) { MAKE_LOOP_CE(u64a); } + +#endif diff --git a/regex/fdr/flood_runtime.h b/regex/fdr/flood_runtime.h new file mode 100644 index 000000000..2d5a32d92 --- /dev/null +++ b/regex/fdr/flood_runtime.h @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FLOOD_RUNTIME +#define FLOOD_RUNTIME + +#if defined(ARCH_64_BIT) +#define FLOOD_64 +#else +#define FLOOD_32 +#endif +#define FLOOD_MINIMUM_SIZE 256 +#define FLOOD_BACKOFF_START 32 + +static really_inline +const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) { + // if we don't have a flood at either the start or end, + // or have a very small buffer, don't bother with flood detection + if (len < FLOOD_MINIMUM_SIZE) { + return buf + len; + } + + /* entry points in runtime.c prefetch relevant data */ +#ifndef FLOOD_32 + u64a x11 = *(const u64a *)ROUNDUP_PTR(buf, 8); + u64a x12 = *(const u64a *)ROUNDUP_PTR(buf+8, 8); + if (x11 == x12) { + return buf + floodBackoff; + } + u64a x21 = *(const u64a *)ROUNDUP_PTR(buf + len/2, 8); + u64a x22 = *(const u64a *)ROUNDUP_PTR(buf + len/2 + 8, 8); + if (x21 == x22) { + return buf + floodBackoff; + } + u64a x31 = *(const u64a *)ROUNDUP_PTR(buf + len - 24, 8); + u64a x32 = *(const u64a *)ROUNDUP_PTR(buf + len - 16, 8); + if (x31 == x32) { + return buf + floodBackoff; + } +#else + u32 x11 = *(const u32 *)ROUNDUP_PTR(buf, 4); + u32 x12 = *(const u32 *)ROUNDUP_PTR(buf+4, 4); + if (x11 == x12) { + return buf + floodBackoff; + } + u32 x21 = *(const u32 *)ROUNDUP_PTR(buf + len/2, 4); + u32 x22 = *(const u32 *)ROUNDUP_PTR(buf + len/2 + 4, 4); + if (x21 == x22) { + return buf + floodBackoff; + } + u32 x31 = *(const u32 *)ROUNDUP_PTR(buf + len - 12, 4); + u32 x32 = *(const u32 *)ROUNDUP_PTR(buf + len - 8, 4); + if (x31 == x32) { + return buf + floodBackoff; + } +#endif + return buf + len; +} + +static really_inline +const u8 * floodDetect(const struct FDR * fdr, + const struct FDR_Runtime_Args * a, + const u8 ** ptrPtr, + const u8 * tryFloodDetect, + u32 * floodBackoffPtr, + hwlmcb_rv_t * control, + u32 iterBytes) { + DEBUG_PRINTF("attempting flood detection at %p\n", tryFloodDetect); + const u8 * buf = a->buf; + const size_t len = a->len; + HWLMCallback cb = a->cb; + struct hs_scratch *scratch = a->scratch; + + const u8 * ptr = *ptrPtr; + // tryFloodDetect is never put in places where unconditional + // reads a short distance forward or backward here + // TODO: rationale for this line needs to be rediscovered!! + size_t mainLoopLen = len > 2 * iterBytes ? 
len - 2 * iterBytes : 0; + const u32 i = ptr - buf; + u32 j = i; + + // go from c to our FDRFlood structure + u8 c = buf[i]; + const u8 * fBase = ((const u8 *)fdr) + fdr->floodOffset; + u32 fIdx = ((const u32 *)fBase)[c]; + const struct FDRFlood * fsb = (const struct FDRFlood *)(fBase + sizeof(u32) * 256); + const struct FDRFlood * fl = &fsb[fIdx]; + +#ifndef FLOOD_32 + u64a cmpVal = c; + cmpVal |= cmpVal << 8; + cmpVal |= cmpVal << 16; + cmpVal |= cmpVal << 32; + u64a probe = *(const u64a *)ROUNDUP_PTR(buf+i, 8); +#else + u32 cmpVal = c; + cmpVal |= cmpVal << 8; + cmpVal |= cmpVal << 16; + u32 probe = *(const u32 *)ROUNDUP_PTR(buf+i, 4); +#endif + + if ((probe != cmpVal) || (fl->idCount >= FDR_FLOOD_MAX_IDS)) { + *floodBackoffPtr *= 2; + goto floodout; + } + + if (i < fl->suffix + 7) { + *floodBackoffPtr *= 2; + goto floodout; + } + + j = i - fl->suffix; + +#ifndef FLOOD_32 + j -= (u32)((uintptr_t)buf + j) & 0x7; // push j back to yield 8-aligned addrs + for (; j + 32 < mainLoopLen; j += 32) { + u64a v = *(const u64a *)(buf + j); + u64a v2 = *(const u64a *)(buf + j + 8); + u64a v3 = *(const u64a *)(buf + j + 16); + u64a v4 = *(const u64a *)(buf + j + 24); + if ((v4 != cmpVal) || (v3 != cmpVal) || (v2 != cmpVal) || (v != cmpVal)) { + break; + } + } + for (; j + 8 < mainLoopLen; j += 8) { + u64a v = *(const u64a *)(buf + j); + if (v != cmpVal) { + break; + } + } +#else + j -= (u32)((size_t)buf + j) & 0x3; // push j back to yield 4-aligned addrs + for (; j + 16 < mainLoopLen; j += 16) { + u32 v = *(const u32 *)(buf + j); + u32 v2 = *(const u32 *)(buf + j + 4); + u32 v3 = *(const u32 *)(buf + j + 8); + u32 v4 = *(const u32 *)(buf + j + 12); + if ((v4 != cmpVal) || (v3 != cmpVal) || (v2 != cmpVal) || (v != cmpVal)) { + break; + } + } + for (; j + 4 < mainLoopLen; j += 4) { + u32 v = *(const u32 *)(buf + j); + if (v != cmpVal) { + break; + } + } +#endif + for (; j < mainLoopLen; j++) { + u8 v = *(const u8 *)(buf + j); + if (v != c) { + break; + } + } + if (j > i ) { + j--; // needed for some reaches + u32 itersAhead = (j-i)/iterBytes; + u32 floodSize = itersAhead*iterBytes; + + DEBUG_PRINTF("flooding %u size j %u i %u fl->idCount %hu " + "*control %016llx fl->allGroups %016llx\n", + floodSize, j, i, fl->idCount, *control, fl->allGroups); + DEBUG_PRINTF("mainloopLen %zu mainStart ??? mainEnd ??? 
len %zu\n", + mainLoopLen, len); + + if (fl->idCount && (*control & fl->allGroups)) { + switch (fl->idCount) { +#if !defined(FLOOD_DEBUG) + // Carefully unrolled code + case 1: + for (u32 t = 0; t < floodSize && (*control & fl->allGroups); + t += 4) { + DEBUG_PRINTF("aaa %u %llx\n", t, fl->groups[0]); + if (*control & fl->groups[0]) { + *control = cb(i + t + 0, fl->ids[0], scratch); + } + if (*control & fl->groups[0]) { + *control = cb(i + t + 1, fl->ids[0], scratch); + } + if (*control & fl->groups[0]) { + *control = cb(i + t + 2, fl->ids[0], scratch); + } + if (*control & fl->groups[0]) { + *control = cb(i + t + 3, fl->ids[0], scratch); + } + } + break; + case 2: + for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 4) { + if (*control & fl->groups[0]) { + *control = cb(i + t, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t, fl->ids[1], scratch); + } + if (*control & fl->groups[0]) { + *control = + cb(i + t + 1, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t + 1, fl->ids[1], scratch); + } + if (*control & fl->groups[0]) { + *control = cb(i + t + 2, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t + 2, fl->ids[1], scratch); + } + if (*control & fl->groups[0]) { + *control = cb(i + t + 3, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t + 3, fl->ids[1], scratch); + } + } + break; + case 3: + for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 2) { + if (*control & fl->groups[0]) { + *control = cb(i + t, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t, fl->ids[1], scratch); + } + if (*control & fl->groups[2]) { + *control = cb(i + t, fl->ids[2], scratch); + } + if (*control & fl->groups[0]) { + *control = cb(i + t + 1, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t + 1, fl->ids[1], scratch); + } + if (*control & fl->groups[2]) { + *control = cb(i + t + 1, fl->ids[2], scratch); + } + } + break; + default: + // slow generalized loop + for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 2) { + + if (*control & fl->groups[0]) { + *control = cb(i + t, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t, fl->ids[1], scratch); + } + if (*control & fl->groups[2]) { + *control = cb(i + t, fl->ids[2], scratch); + } + if (*control & fl->groups[3]) { + *control = cb(i + t, fl->ids[3], scratch); + } + + for (u32 t2 = 4; t2 < fl->idCount; t2++) { + if (*control & fl->groups[t2]) { + *control = cb(i + t, fl->ids[t2], scratch); + } + } + + if (*control & fl->groups[0]) { + *control = cb(i + t + 1, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t + 1, fl->ids[1], scratch); + } + if (*control & fl->groups[2]) { + *control = cb(i + t + 1, fl->ids[2], scratch); + } + if (*control & fl->groups[3]) { + *control = cb(i + t + 1, fl->ids[3], scratch); + } + + for (u32 t2 = 4; t2 < fl->idCount; t2++) { + if (*control & fl->groups[t2]) { + *control = cb(i + t + 1, fl->ids[t2], scratch); + } + } + } + break; +#else + // Fallback for debugging + default: + for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t++) { + for (u32 t2 = 0; t2 < fl->idCount; t2++) { + if (*control & fl->groups[t2]) { + *control = cb(i + t, fl->ids[t2], scratch); + } + } + } +#endif + } + } + ptr += floodSize; + } else { + *floodBackoffPtr *= 2; + } + +floodout: + if (j + *floodBackoffPtr < mainLoopLen - 128) { + tryFloodDetect = 
buf + MAX(i,j) + *floodBackoffPtr; + } else { + tryFloodDetect = buf + mainLoopLen; // set so we never do another flood detect + } + *ptrPtr = ptr; + DEBUG_PRINTF("finished flood detection at %p (next check %p)\n", + ptr, tryFloodDetect); + return tryFloodDetect; +} + +#endif diff --git a/regex/fdr/teddy.c b/regex/fdr/teddy.c new file mode 100644 index 000000000..e6f547619 --- /dev/null +++ b/regex/fdr/teddy.c @@ -0,0 +1,1114 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Teddy literal matcher: SSSE3 engine runtime. 
+ */ + +#include "fdr_internal.h" +#include "flood_runtime.h" +#include "teddy.h" +#include "teddy_internal.h" +#include "teddy_runtime_common.h" +#include "util/simd_utils.h" + +const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} +}; + +#if defined(HAVE_AVX512VBMI) // VBMI strong teddy + +#define CONF_CHUNK_64(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u64a)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#define CONF_CHUNK_32(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u32)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#else + +#define CONF_CHUNK_64(chunk, bucket, off, reason, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u64a)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#define CONF_CHUNK_32(chunk, bucket, off, reason, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u32)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#endif + +#if defined(HAVE_AVX512VBMI) // VBMI strong teddy + +#ifdef ARCH_64_BIT +#define CONFIRM_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m128 p128_0 = extract128from512(var, 0); \ + m128 p128_1 = extract128from512(var, 1); \ + m128 p128_2 = extract128from512(var, 2); \ + m128 p128_3 = extract128from512(var, 3); \ + u64a part1 = movq(p128_0); \ + u64a part2 = movq(rshiftbyte_m128(p128_0, 8)); \ + u64a part3 = movq(p128_1); \ + u64a part4 = movq(rshiftbyte_m128(p128_1, 8)); \ + u64a part5 = movq(p128_2); \ + u64a part6 = movq(rshiftbyte_m128(p128_2, 8)); \ + u64a part7 = movq(p128_3); \ + u64a part8 = movq(rshiftbyte_m128(p128_3, 8)); \ + CONF_CHUNK_64(part1, bucket, offset, reason, pt, conf_fn); \ + CONF_CHUNK_64(part2, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_CHUNK_64(part3, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_CHUNK_64(part4, bucket, offset + 24, reason, pt, conf_fn); \ + CONF_CHUNK_64(part5, bucket, offset + 32, reason, pt, conf_fn); \ + CONF_CHUNK_64(part6, bucket, offset + 40, reason, pt, conf_fn); \ + CONF_CHUNK_64(part7, bucket, offset + 48, reason, pt, conf_fn); \ + CONF_CHUNK_64(part8, bucket, offset + 56, reason, pt, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m128 p128_0 = extract128from512(var, 0); \ + m128 p128_1 = extract128from512(var, 1); \ + m128 p128_2 = extract128from512(var, 2); \ + m128 p128_3 = extract128from512(var, 3); \ + u32 part1 = movd(p128_0); \ + u32 part2 = movd(rshiftbyte_m128(p128_0, 4)); \ + u32 part3 = movd(rshiftbyte_m128(p128_0, 8)); \ + u32 part4 = movd(rshiftbyte_m128(p128_0, 12)); \ + u32 part5 = movd(p128_1); \ + u32 part6 = movd(rshiftbyte_m128(p128_1, 4)); \ + u32 part7 = movd(rshiftbyte_m128(p128_1, 8)); \ + u32 part8 = movd(rshiftbyte_m128(p128_1, 12)); \ + u32 part9 = movd(p128_2); \ + u32 part10 = movd(rshiftbyte_m128(p128_2, 4)); \ + u32 part11 = movd(rshiftbyte_m128(p128_2, 8)); \ + u32 part12 = movd(rshiftbyte_m128(p128_2, 12)); \ + u32 part13 = movd(p128_3); \ + u32 part14 = movd(rshiftbyte_m128(p128_3, 4)); \ + u32 part15 = movd(rshiftbyte_m128(p128_3, 8)); \ + u32 part16 = movd(rshiftbyte_m128(p128_3, 12)); \ + CONF_CHUNK_32(part1, bucket, offset, reason, 
pt, conf_fn); \ + CONF_CHUNK_32(part2, bucket, offset + 4, reason, pt, conf_fn); \ + CONF_CHUNK_32(part3, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_CHUNK_32(part4, bucket, offset + 12, reason, pt, conf_fn); \ + CONF_CHUNK_32(part5, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_CHUNK_32(part6, bucket, offset + 20, reason, pt, conf_fn); \ + CONF_CHUNK_32(part7, bucket, offset + 24, reason, pt, conf_fn); \ + CONF_CHUNK_32(part8, bucket, offset + 28, reason, pt, conf_fn); \ + CONF_CHUNK_32(part9, bucket, offset + 32, reason, pt, conf_fn); \ + CONF_CHUNK_32(part10, bucket, offset + 36, reason, pt, conf_fn); \ + CONF_CHUNK_32(part11, bucket, offset + 40, reason, pt, conf_fn); \ + CONF_CHUNK_32(part12, bucket, offset + 44, reason, pt, conf_fn); \ + CONF_CHUNK_32(part13, bucket, offset + 48, reason, pt, conf_fn); \ + CONF_CHUNK_32(part14, bucket, offset + 52, reason, pt, conf_fn); \ + CONF_CHUNK_32(part15, bucket, offset + 56, reason, pt, conf_fn); \ + CONF_CHUNK_32(part16, bucket, offset + 60, reason, pt, conf_fn); \ + } \ +} while(0) +#endif + +#define PREP_SHUF_MASK \ + m512 lo = and512(val, *lo_mask); \ + m512 hi = and512(rshift64_m512(val, 4), *lo_mask) + +#define TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo), \ + pshufb_m512(dup_mask[1], hi)); + +#define TEDDY_VBMI_PSHUFB_OR_M2 \ + TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo), \ + pshufb_m512(dup_mask[3], hi)); + +#define TEDDY_VBMI_PSHUFB_OR_M3 \ + TEDDY_VBMI_PSHUFB_OR_M2 \ + m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo), \ + pshufb_m512(dup_mask[5], hi)); + +#define TEDDY_VBMI_PSHUFB_OR_M4 \ + TEDDY_VBMI_PSHUFB_OR_M3 \ + m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo), \ + pshufb_m512(dup_mask[7], hi)); + +#define TEDDY_VBMI_SL1_MASK 0xfffffffffffffffeULL +#define TEDDY_VBMI_SL2_MASK 0xfffffffffffffffcULL +#define TEDDY_VBMI_SL3_MASK 0xfffffffffffffff8ULL + +#define TEDDY_VBMI_SHIFT_M1 + +#define TEDDY_VBMI_SHIFT_M2 \ + TEDDY_VBMI_SHIFT_M1 \ + m512 sl1 = maskz_vpermb512(TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1); + +#define TEDDY_VBMI_SHIFT_M3 \ + TEDDY_VBMI_SHIFT_M2 \ + m512 sl2 = maskz_vpermb512(TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2); + +#define TEDDY_VBMI_SHIFT_M4 \ + TEDDY_VBMI_SHIFT_M3 \ + m512 sl3 = maskz_vpermb512(TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3); + +#define SHIFT_OR_M1 \ + shuf_or_b0 + +#define SHIFT_OR_M2 \ + or512(sl1, SHIFT_OR_M1) + +#define SHIFT_OR_M3 \ + or512(sl2, SHIFT_OR_M2) + +#define SHIFT_OR_M4 \ + or512(sl3, SHIFT_OR_M3) + +static really_inline +m512 prep_conf_teddy_m1(const m512 *lo_mask, const m512 *dup_mask, + UNUSED const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M1; + TEDDY_VBMI_SHIFT_M1; + return SHIFT_OR_M1; +} + +static really_inline +m512 prep_conf_teddy_m2(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M2; + TEDDY_VBMI_SHIFT_M2; + return SHIFT_OR_M2; +} + +static really_inline +m512 prep_conf_teddy_m3(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M3; + TEDDY_VBMI_SHIFT_M3; + return SHIFT_OR_M3; +} + +static really_inline +m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M4; + TEDDY_VBMI_SHIFT_M4; + return SHIFT_OR_M4; +} + +#define PREP_CONF_FN(val, n) \ + prep_conf_teddy_m##n(&lo_mask, dup_mask, sl_msk, val) + 
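
The PSHUFB-heavy mask preparation above boils down, per input byte, to two 16-entry table lookups keyed by the byte's low and high nibbles. The following standalone sketch (illustration only, not part of this patch; the names nibble_masks, add_literal_byte and classify are invented) shows the scalar idea for up to 8 buckets: a clear bit in lo[c & 0xf] | hi[c >> 4] marks byte c as a candidate for that bucket. Nibble aliasing can yield false positives, which is why every candidate still goes through the confirm stage.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Scalar model of one Teddy shuffle-mask pair: 16 entries indexed by the
     * low nibble and 16 indexed by the high nibble. Bit `bucket` is cleared
     * in an entry when a literal in that bucket starts with such a nibble. */
    struct nibble_masks {
        uint8_t lo[16];
        uint8_t hi[16];
    };

    static void add_literal_byte(struct nibble_masks *m, unsigned bucket,
                                 uint8_t c)
    {
        m->lo[c & 0xf] &= (uint8_t)~(1u << bucket);
        m->hi[c >> 4]  &= (uint8_t)~(1u << bucket);
    }

    /* A clear bit in position `bucket` of the result means "candidate". */
    static uint8_t classify(const struct nibble_masks *m, uint8_t c)
    {
        return m->lo[c & 0xf] | m->hi[c >> 4];
    }

    int main(void)
    {
        struct nibble_masks m;
        memset(&m, 0xff, sizeof(m));     /* all bits set: nothing matches */
        add_literal_byte(&m, 0, 'f');    /* bucket 0 literals start with 'f' */
        add_literal_byte(&m, 1, 'd');    /* bucket 1 literals start with 'd' */

        printf("'f' -> %02x\n", classify(&m, 'f')); /* fe: bucket 0 hit */
        printf("'d' -> %02x\n", classify(&m, 'd')); /* fd: bucket 1 hit */
        printf("'x' -> %02x\n", classify(&m, 'x')); /* ff: no candidate */
        return 0;
    }

The vectorised code does the same thing 64 bytes at a time: pshufb_m512(dup_mask[0], lo) and pshufb_m512(dup_mask[1], hi) are the two table lookups and the or512() that combines them is the `|` above, while the additional mask pairs used by prep_conf_teddy_m2/m3/m4 extend the check to the second, third and fourth literal byte.
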
+#define TEDDY_VBMI_SL1_POS 15 +#define TEDDY_VBMI_SL2_POS 14 +#define TEDDY_VBMI_SL3_POS 13 + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M1 + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M1 \ + sl_msk[0] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL1_POS); + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \ + sl_msk[1] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL2_POS); + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M4 \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \ + sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS); + +#define PREPARE_MASKS_1 \ + dup_mask[0] = set4x128(maskBase[0]); \ + dup_mask[1] = set4x128(maskBase[1]); + +#define PREPARE_MASKS_2 \ + PREPARE_MASKS_1 \ + dup_mask[2] = set4x128(maskBase[2]); \ + dup_mask[3] = set4x128(maskBase[3]); + +#define PREPARE_MASKS_3 \ + PREPARE_MASKS_2 \ + dup_mask[4] = set4x128(maskBase[4]); \ + dup_mask[5] = set4x128(maskBase[5]); + +#define PREPARE_MASKS_4 \ + PREPARE_MASKS_3 \ + dup_mask[6] = set4x128(maskBase[6]); \ + dup_mask[7] = set4x128(maskBase[7]); + +#define PREPARE_MASKS(n) \ + m512 lo_mask = set64x8(0xf); \ + m512 dup_mask[n * 2]; \ + m512 sl_msk[n - 1]; \ + PREPARE_MASKS_##n \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M##n + +#define TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffffffffffULL >> n_sh) +#define TEDDY_VBMI_CONF_MASK_FULL (0xffffffffffffffffULL << n_sh) +#define TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffffffffffULL >> (64 - n) << overlap) +#define TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffffffffffULL >> (64 - n_sh)) + +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 64; \ + u32 n_sh = n_msk - 1; \ + const size_t loopBytes = 64 - n_sh; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + PREPARE_MASKS(n_msk); \ + const u32 *confBase = getConfBase(teddy); \ + \ + u64a k = TEDDY_VBMI_CONF_MASK_FULL; \ + m512 p_mask = set_mask_m512(~k); \ + u32 overlap = 0; \ + u64a patch = 0; \ + if (likely(ptr + loopBytes <= buf_end)) { \ + m512 p_mask0 = set_mask_m512(~TEDDY_VBMI_CONF_MASK_HEAD); \ + m512 r_0 = PREP_CONF_FN(loadu512(ptr), n_msk); \ + r_0 = or512(r_0, p_mask0); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, ptr, conf_fn); \ + ptr += loopBytes; \ + overlap = n_sh; \ + patch = TEDDY_VBMI_LOAD_MASK_PATCH; \ + } \ + \ + for (; ptr + loopBytes <= buf_end; ptr += loopBytes) { \ + __builtin_prefetch(ptr - n_sh + (64 * 2)); \ + CHECK_FLOOD; \ + m512 r_0 = PREP_CONF_FN(loadu512(ptr - n_sh), n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, ptr - n_sh, conf_fn); \ + } \ + \ + assert(ptr + loopBytes > buf_end); \ + if (ptr < buf_end) { \ + u32 left = (u32)(buf_end - ptr); \ + u64a k1 = TEDDY_VBMI_CONF_MASK_VAR(left); \ + m512 p_mask1 = set_mask_m512(~k1); \ + m512 val_0 = loadu_maskz_m512(k1 | patch, ptr - overlap); \ + m512 r_0 = PREP_CONF_FN(val_0, n_msk); \ + r_0 = or512(r_0, p_mask1); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, ptr - overlap, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#elif defined(HAVE_AVX512) // AVX512 reinforced teddy + +#ifdef ARCH_64_BIT +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m128 
p128_0 = extract128from512(var, 0); \ + m128 p128_1 = extract128from512(var, 1); \ + m128 p128_2 = extract128from512(var, 2); \ + m128 p128_3 = extract128from512(var, 3); \ + u64a part1 = movq(p128_0); \ + u64a part2 = movq(rshiftbyte_m128(p128_0, 8)); \ + u64a part3 = movq(p128_1); \ + u64a part4 = movq(rshiftbyte_m128(p128_1, 8)); \ + u64a part5 = movq(p128_2); \ + u64a part6 = movq(rshiftbyte_m128(p128_2, 8)); \ + u64a part7 = movq(p128_3); \ + u64a part8 = movq(rshiftbyte_m128(p128_3, 8)); \ + CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn); \ + CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn); \ + CONF_CHUNK_64(part5, bucket, offset + 32, reason, conf_fn); \ + CONF_CHUNK_64(part6, bucket, offset + 40, reason, conf_fn); \ + CONF_CHUNK_64(part7, bucket, offset + 48, reason, conf_fn); \ + CONF_CHUNK_64(part8, bucket, offset + 56, reason, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m128 p128_0 = extract128from512(var, 0); \ + m128 p128_1 = extract128from512(var, 1); \ + m128 p128_2 = extract128from512(var, 2); \ + m128 p128_3 = extract128from512(var, 3); \ + u32 part1 = movd(p128_0); \ + u32 part2 = movd(rshiftbyte_m128(p128_0, 4)); \ + u32 part3 = movd(rshiftbyte_m128(p128_0, 8)); \ + u32 part4 = movd(rshiftbyte_m128(p128_0, 12)); \ + u32 part5 = movd(p128_1); \ + u32 part6 = movd(rshiftbyte_m128(p128_1, 4)); \ + u32 part7 = movd(rshiftbyte_m128(p128_1, 8)); \ + u32 part8 = movd(rshiftbyte_m128(p128_1, 12)); \ + u32 part9 = movd(p128_2); \ + u32 part10 = movd(rshiftbyte_m128(p128_2, 4)); \ + u32 part11 = movd(rshiftbyte_m128(p128_2, 8)); \ + u32 part12 = movd(rshiftbyte_m128(p128_2, 12)); \ + u32 part13 = movd(p128_3); \ + u32 part14 = movd(rshiftbyte_m128(p128_3, 4)); \ + u32 part15 = movd(rshiftbyte_m128(p128_3, 8)); \ + u32 part16 = movd(rshiftbyte_m128(p128_3, 12)); \ + CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \ + CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \ + CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn); \ + CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn); \ + CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn); \ + CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn); \ + CONF_CHUNK_32(part9, bucket, offset + 32, reason, conf_fn); \ + CONF_CHUNK_32(part10, bucket, offset + 36, reason, conf_fn); \ + CONF_CHUNK_32(part11, bucket, offset + 40, reason, conf_fn); \ + CONF_CHUNK_32(part12, bucket, offset + 44, reason, conf_fn); \ + CONF_CHUNK_32(part13, bucket, offset + 48, reason, conf_fn); \ + CONF_CHUNK_32(part14, bucket, offset + 52, reason, conf_fn); \ + CONF_CHUNK_32(part15, bucket, offset + 56, reason, conf_fn); \ + CONF_CHUNK_32(part16, bucket, offset + 60, reason, conf_fn); \ + } \ +} while(0) +#endif + +#define PREP_SHUF_MASK_NO_REINFORCEMENT(val) \ + m512 lo = and512(val, *lo_mask); \ + m512 hi = and512(rshift64_m512(val, 4), *lo_mask) + +#define PREP_SHUF_MASK \ + PREP_SHUF_MASK_NO_REINFORCEMENT(load512(ptr)); \ + *c_16 = *(ptr + 15); \ + *c_32 = *(ptr + 31); \ + *c_48 = *(ptr + 47); \ + m512 r_msk = set512_64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],\ + 0ULL, r_msk_base[*c_16], 0ULL, r_msk_base[*c_0]);\ + *c_0 = *(ptr + 63) + 
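
The reinforcement lookups in PREP_SHUF_MASK above are easier to see in scalar form: each 128-bit lane of the 64-byte block is paired with a 64-bit value from r_msk_base, indexed by the byte immediately preceding that lane, with c_0 carrying the previous block's last byte across iterations (it starts at the out-of-range index 0x100, meaning "no history"). A small sketch with hypothetical names, showing only this bookkeeping:

#include <stdint.h>

/* Hypothetical scalar restatement of the context-byte bookkeeping in
 * PREP_SHUF_MASK; r_msk_base is the reinforced-mask table shipped in the
 * Teddy bytecode (index 0x100 is valid by construction and acts as the
 * "no previous byte" sentinel). */
static void gather_reinforcement(const uint64_t *r_msk_base,
                                 const uint8_t block[64],
                                 uint32_t *c0,          /* carried between blocks */
                                 uint64_t lane_msk[4])
{
    lane_msk[0] = r_msk_base[*c0];        /* byte 63 of the previous block */
    lane_msk[1] = r_msk_base[block[15]];  /* byte just before lane 1       */
    lane_msk[2] = r_msk_base[block[31]];  /* byte just before lane 2       */
    lane_msk[3] = r_msk_base[block[47]];  /* byte just before lane 3       */
    *c0 = block[63];                      /* becomes lane 0's context next time */
}

The per-lane values are then OR'd into the shuffle result (the set512_64() in PREP_SHUF_MASK), which, as the name suggests, reinforces the results at each 128-bit lane boundary where the in-lane shifts applied later (lshift128_m512 in the SHIFT_OR macros below) lose the crossing byte.
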
+#define SHIFT_OR_M1 \ + or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi)) + +#define SHIFT_OR_M2 \ + or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo), \ + pshufb_m512(dup_mask[3], hi)), \ + 1), SHIFT_OR_M1) + +#define SHIFT_OR_M3 \ + or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo), \ + pshufb_m512(dup_mask[5], hi)), \ + 2), SHIFT_OR_M2) + +#define SHIFT_OR_M4 \ + or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo), \ + pshufb_m512(dup_mask[7], hi)), \ + 3), SHIFT_OR_M3) + +static really_inline +m512 prep_conf_teddy_no_reinforcement_m1(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M1; +} + +static really_inline +m512 prep_conf_teddy_no_reinforcement_m2(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M2; +} + +static really_inline +m512 prep_conf_teddy_no_reinforcement_m3(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M3; +} + +static really_inline +m512 prep_conf_teddy_no_reinforcement_m4(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M4; +} + +static really_inline +m512 prep_conf_teddy_m1(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) { + PREP_SHUF_MASK; + return or512(SHIFT_OR_M1, r_msk); +} + +static really_inline +m512 prep_conf_teddy_m2(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) { + PREP_SHUF_MASK; + return or512(SHIFT_OR_M2, r_msk); +} + +static really_inline +m512 prep_conf_teddy_m3(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) { + PREP_SHUF_MASK; + return or512(SHIFT_OR_M3, r_msk); +} + +static really_inline +m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) { + PREP_SHUF_MASK; + return or512(SHIFT_OR_M4, r_msk); +} + +#define PREP_CONF_FN_NO_REINFORCEMENT(val, n) \ + prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val) + +#define PREP_CONF_FN(ptr, n) \ + prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, \ + &c_0, &c_16, &c_32, &c_48) + +#define PREPARE_MASKS_1 \ + dup_mask[0] = set4x128(maskBase[0]); \ + dup_mask[1] = set4x128(maskBase[1]); + +#define PREPARE_MASKS_2 \ + PREPARE_MASKS_1 \ + dup_mask[2] = set4x128(maskBase[2]); \ + dup_mask[3] = set4x128(maskBase[3]); + +#define PREPARE_MASKS_3 \ + PREPARE_MASKS_2 \ + dup_mask[4] = set4x128(maskBase[4]); \ + dup_mask[5] = set4x128(maskBase[5]); + +#define PREPARE_MASKS_4 \ + PREPARE_MASKS_3 \ + dup_mask[6] = set4x128(maskBase[6]); \ + dup_mask[7] = set4x128(maskBase[7]); + +#define PREPARE_MASKS(n) \ + m512 lo_mask = set64x8(0xf); \ + m512 dup_mask[n * 2]; \ + PREPARE_MASKS_##n + +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 128; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, 
a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + PREPARE_MASKS(n_msk); \ + const u32 *confBase = getConfBase(teddy); \ + \ + const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk); \ + u32 c_0 = 0x100; \ + u32 c_16 = 0x100; \ + u32 c_32 = 0x100; \ + u32 c_48 = 0x100; \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 64); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 64; \ + m512 p_mask; \ + m512 val_0 = vectoredLoad512(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 64; \ + } \ + \ + if (ptr + 64 <= buf_end) { \ + m512 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 64; \ + } \ + \ + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m512 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + m512 r_1 = PREP_CONF_FN(ptr + 64, n_msk); \ + CONFIRM_TEDDY(r_1, 8, 64, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 64 <= buf_end) { \ + m512 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 64; \ + } \ + \ + assert(ptr + 64 > buf_end); \ + if (ptr < buf_end) { \ + m512 p_mask; \ + m512 val_0 = vectoredLoad512(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy + +#ifdef ARCH_64_BIT +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff256(var, ones256()))) { \ + m128 lo = movdq_lo(var); \ + m128 hi = movdq_hi(var); \ + u64a part1 = movq(lo); \ + u64a part2 = movq(rshiftbyte_m128(lo, 8)); \ + u64a part3 = movq(hi); \ + u64a part4 = movq(rshiftbyte_m128(hi, 8)); \ + CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn); \ + CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff256(var, ones256()))) { \ + m128 lo = movdq_lo(var); \ + m128 hi = movdq_hi(var); \ + u32 part1 = movd(lo); \ + u32 part2 = movd(rshiftbyte_m128(lo, 4)); \ + u32 part3 = movd(rshiftbyte_m128(lo, 8)); \ + u32 part4 = movd(rshiftbyte_m128(lo, 12)); \ + u32 part5 = movd(hi); \ + u32 part6 = movd(rshiftbyte_m128(hi, 4)); \ + u32 part7 = movd(rshiftbyte_m128(hi, 8)); \ + u32 part8 = movd(rshiftbyte_m128(hi, 12)); \ + CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \ + CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \ + CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn); \ + CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn); \ + CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn); \ + CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn); \ + } \ +} while(0) +#endif + +#define PREP_SHUF_MASK_NO_REINFORCEMENT(val) \ 
+ m256 lo = and256(val, *lo_mask); \ + m256 hi = and256(rshift64_m256(val, 4), *lo_mask) + +#define PREP_SHUF_MASK \ + PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr)); \ + *c_128 = *(ptr + 15); \ + m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \ + *c_0 = *(ptr + 31) + +#define SHIFT_OR_M1 \ + or256(pshufb_m256(dup_mask[0], lo), pshufb_m256(dup_mask[1], hi)) + +#define SHIFT_OR_M2 \ + or256(lshift128_m256(or256(pshufb_m256(dup_mask[2], lo), \ + pshufb_m256(dup_mask[3], hi)), \ + 1), SHIFT_OR_M1) + +#define SHIFT_OR_M3 \ + or256(lshift128_m256(or256(pshufb_m256(dup_mask[4], lo), \ + pshufb_m256(dup_mask[5], hi)), \ + 2), SHIFT_OR_M2) + +#define SHIFT_OR_M4 \ + or256(lshift128_m256(or256(pshufb_m256(dup_mask[6], lo), \ + pshufb_m256(dup_mask[7], hi)), \ + 3), SHIFT_OR_M3) + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m1(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M1; +} + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m2(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M2; +} + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m3(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M3; +} + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m4(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M4; +} + +static really_inline +m256 prep_conf_teddy_m1(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M1, r_msk); +} + +static really_inline +m256 prep_conf_teddy_m2(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M2, r_msk); +} + +static really_inline +m256 prep_conf_teddy_m3(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M3, r_msk); +} + +static really_inline +m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M4, r_msk); +} + +#define PREP_CONF_FN_NO_REINFORCEMENT(val, n) \ + prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val) + +#define PREP_CONF_FN(ptr, n) \ + prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128) + +#define PREPARE_MASKS_1 \ + dup_mask[0] = set2x128(maskBase[0]); \ + dup_mask[1] = set2x128(maskBase[1]); + +#define PREPARE_MASKS_2 \ + PREPARE_MASKS_1 \ + dup_mask[2] = set2x128(maskBase[2]); \ + dup_mask[3] = set2x128(maskBase[3]); + +#define PREPARE_MASKS_3 \ + PREPARE_MASKS_2 \ + dup_mask[4] = set2x128(maskBase[4]); \ + dup_mask[5] = set2x128(maskBase[5]); + +#define PREPARE_MASKS_4 \ + PREPARE_MASKS_3 \ + dup_mask[6] = set2x128(maskBase[6]); \ + dup_mask[7] = set2x128(maskBase[7]); + +#define PREPARE_MASKS(n) \ + m256 lo_mask = set32x8(0xf); \ + m256 dup_mask[n * 2]; \ + PREPARE_MASKS_##n + +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = 
ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 64; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + PREPARE_MASKS(n_msk); \ + const u32 *confBase = getConfBase(teddy); \ + \ + const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk); \ + u32 c_0 = 0x100; \ + u32 c_128 = 0x100; \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 32); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 32; \ + m256 p_mask; \ + m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 32; \ + } \ + \ + if (ptr + 32 <= buf_end) { \ + m256 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 32; \ + } \ + \ + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m256 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + m256 r_1 = PREP_CONF_FN(ptr + 32, n_msk); \ + CONFIRM_TEDDY(r_1, 8, 32, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 32 <= buf_end) { \ + m256 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 32; \ + } \ + \ + assert(ptr + 32 > buf_end); \ + if (ptr < buf_end) { \ + m256 p_mask; \ + m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#else // not defined HAVE_AVX2 + +#ifdef ARCH_64_BIT +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff128(var, ones128()))) { \ + u64a lo = movq(var); \ + u64a hi = movq(rshiftbyte_m128(var, 8)); \ + CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff128(var, ones128()))) { \ + u32 part1 = movd(var); \ + u32 part2 = movd(rshiftbyte_m128(var, 4)); \ + u32 part3 = movd(rshiftbyte_m128(var, 8)); \ + u32 part4 = movd(rshiftbyte_m128(var, 12)); \ + CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \ + CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \ + } \ +} while(0) +#endif + +static really_inline +m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) { + m128 mask = set16x8(0xf); + m128 lo = and128(val, mask); + m128 hi = and128(rshift64_m128(val, 4), mask); + return or128(pshufb_m128(maskBase[0 * 2], lo), + pshufb_m128(maskBase[0 * 2 + 1], hi)); +} + +static really_inline +m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) { + m128 mask = set16x8(0xf); + m128 lo = and128(val, mask); + m128 hi = and128(rshift64_m128(val, 4), mask); + m128 r = prep_conf_teddy_m1(maskBase, val); + + m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo), + pshufb_m128(maskBase[1 * 2 + 1], hi)); + m128 res_shifted_1 = palignr(res_1, *old_1, 16 - 1); + *old_1 = 
res_1; + return or128(r, res_shifted_1); +} + +static really_inline +m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, + m128 val) { + m128 mask = set16x8(0xf); + m128 lo = and128(val, mask); + m128 hi = and128(rshift64_m128(val, 4), mask); + m128 r = prep_conf_teddy_m2(maskBase, old_1, val); + + m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo), + pshufb_m128(maskBase[2 * 2 + 1], hi)); + m128 res_shifted_2 = palignr(res_2, *old_2, 16 - 2); + *old_2 = res_2; + return or128(r, res_shifted_2); +} + +static really_inline +m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, + m128 *old_3, m128 val) { + m128 mask = set16x8(0xf); + m128 lo = and128(val, mask); + m128 hi = and128(rshift64_m128(val, 4), mask); + m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val); + + m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo), + pshufb_m128(maskBase[3 * 2 + 1], hi)); + m128 res_shifted_3 = palignr(res_3, *old_3, 16 - 3); + *old_3 = res_3; + return or128(r, res_shifted_3); +} + +#define FDR_EXEC_TEDDY_RES_OLD_1 + +#define FDR_EXEC_TEDDY_RES_OLD_2 \ + m128 res_old_1 = zeroes128(); + +#define FDR_EXEC_TEDDY_RES_OLD_3 \ + m128 res_old_1 = zeroes128(); \ + m128 res_old_2 = zeroes128(); + +#define FDR_EXEC_TEDDY_RES_OLD_4 \ + m128 res_old_1 = zeroes128(); \ + m128 res_old_2 = zeroes128(); \ + m128 res_old_3 = zeroes128(); + +#define FDR_EXEC_TEDDY_RES_OLD(n) FDR_EXEC_TEDDY_RES_OLD_##n + +#define PREP_CONF_FN_1(mask_base, val) \ + prep_conf_teddy_m1(mask_base, val) + +#define PREP_CONF_FN_2(mask_base, val) \ + prep_conf_teddy_m2(mask_base, &res_old_1, val) + +#define PREP_CONF_FN_3(mask_base, val) \ + prep_conf_teddy_m3(mask_base, &res_old_1, &res_old_2, val) + +#define PREP_CONF_FN_4(mask_base, val) \ + prep_conf_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val) + +#define PREP_CONF_FN(mask_base, val, n) \ + PREP_CONF_FN_##n(mask_base, val) + +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 32; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + const u32 *confBase = getConfBase(teddy); \ + \ + FDR_EXEC_TEDDY_RES_OLD(n_msk); \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 16; \ + m128 p_mask; \ + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk); \ + r_0 = or128(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + if (ptr + 16 <= buf_end) { \ + m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + m128 r_1 = PREP_CONF_FN(maskBase, load128(ptr + 16), n_msk); \ + CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 16 <= buf_end) 
{ \ + m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 16; \ + } \ + \ + assert(ptr + 16 > buf_end); \ + if (ptr < buf_end) { \ + m128 p_mask; \ + m128 val_0 = vectoredLoad128(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk); \ + r_0 = or128(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#endif // HAVE_AVX2 HAVE_AVX512 + +hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); +} diff --git a/regex/fdr/teddy.h b/regex/fdr/teddy.h new file mode 100644 index 000000000..40ae07562 --- /dev/null +++ b/regex/fdr/teddy.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2016-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Teddy literal matcher: function declarations. + */ + +#ifndef TEDDY_H_ +#define TEDDY_H_ + +#include "hwlm/hwlm.h" // for hwlm_group_t +#include "util/arch.h" + +struct FDR; // forward declaration from fdr_internal.h +struct FDR_Runtime_Args; + +hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +#if defined(HAVE_AVX2) + +hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +#endif /* HAVE_AVX2 */ + +#endif /* TEDDY_H_ */ diff --git a/regex/fdr/teddy_avx2.c b/regex/fdr/teddy_avx2.c new file mode 100644 index 000000000..6a6b27a5f --- /dev/null +++ b/regex/fdr/teddy_avx2.c @@ -0,0 +1,709 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Teddy literal matcher: AVX2 engine runtime. + */ + +#include "fdr_internal.h" +#include "flood_runtime.h" +#include "teddy.h" +#include "teddy_internal.h" +#include "teddy_runtime_common.h" +#include "util/arch.h" +#include "util/simd_utils.h" + +#if defined(HAVE_AVX2) + +const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = { + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} +}; + +#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy + +#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u64a)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u32)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +static really_inline +const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) { + return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)) + + ROUNDUP_CL(2 * numMask * sizeof(m256))); +} + +#else + +#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u64a)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u32)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +static really_inline +const m256 *getMaskBase_fat(const struct Teddy *teddy) { + return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); +} + +#endif + +#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy + +const u8 ALIGN_AVX_DIRECTIVE p_mask_interleave[64] = { + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 +}; + +#ifdef ARCH_64_BIT +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m512 msk_interleave = load512(p_mask_interleave); \ + m512 r = vpermb512(msk_interleave, var); \ + m128 r0 = extract128from512(r, 0); \ + m128 r1 = extract128from512(r, 1); \ + m128 r2 = extract128from512(r, 2); \ + m128 r3 = extract128from512(r, 3); \ + u64a part1 = movq(r0); \ + u64a part2 = extract64from128(r0, 1); \ + u64a part3 = movq(r1); \ + u64a part4 = extract64from128(r1, 1); \ + u64a part5 = movq(r2); \ + u64a part6 = extract64from128(r2, 1); \ + u64a part7 = movq(r3); \ + u64a part8 = extract64from128(r3, 1); \ + CONF_FAT_CHUNK_64(part1, bucket, offset, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, pt, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ +do { \ + if 
(unlikely(diff512(var, ones512()))) { \ + m512 msk_interleave = load512(p_mask_interleave); \ + m512 r = vpermb512(msk_interleave, var); \ + m128 r0 = extract128from512(r, 0); \ + m128 r1 = extract128from512(r, 1); \ + m128 r2 = extract128from512(r, 2); \ + m128 r3 = extract128from512(r, 3); \ + u32 part1 = movd(r0); \ + u32 part2 = extract32from128(r0, 1); \ + u32 part3 = extract32from128(r0, 2); \ + u32 part4 = extract32from128(r0, 3); \ + u32 part5 = movd(r1); \ + u32 part6 = extract32from128(r1, 1); \ + u32 part7 = extract32from128(r1, 2); \ + u32 part8 = extract32from128(r1, 3); \ + u32 part9 = movd(r2); \ + u32 part10 = extract32from128(r2, 1); \ + u32 part11 = extract32from128(r2, 2); \ + u32 part12 = extract32from128(r2, 3); \ + u32 part13 = movd(r3); \ + u32 part14 = extract32from128(r3, 1); \ + u32 part15 = extract32from128(r3, 2); \ + u32 part16 = extract32from128(r3, 3); \ + CONF_FAT_CHUNK_32(part1, bucket, offset, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, pt, conf_fn);\ + } \ +} while(0) +#endif + +#define PREP_FAT_SHUF_MASK \ + m512 lo = and512(val, *lo_mask); \ + m512 hi = and512(rshift64_m512(val, 4), *lo_mask) + +#define FAT_TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo), \ + pshufb_m512(dup_mask[1], hi)); + +#define FAT_TEDDY_VBMI_PSHUFB_OR_M2 \ + FAT_TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo), \ + pshufb_m512(dup_mask[3], hi)); + +#define FAT_TEDDY_VBMI_PSHUFB_OR_M3 \ + FAT_TEDDY_VBMI_PSHUFB_OR_M2 \ + m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo), \ + pshufb_m512(dup_mask[5], hi)); + +#define FAT_TEDDY_VBMI_PSHUFB_OR_M4 \ + FAT_TEDDY_VBMI_PSHUFB_OR_M3 \ + m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo), \ + pshufb_m512(dup_mask[7], hi)); + +#define FAT_TEDDY_VBMI_SL1_MASK 0xfffffffefffffffeULL +#define FAT_TEDDY_VBMI_SL2_MASK 0xfffffffcfffffffcULL +#define FAT_TEDDY_VBMI_SL3_MASK 0xfffffff8fffffff8ULL + +#define FAT_TEDDY_VBMI_SHIFT_M1 + +#define FAT_TEDDY_VBMI_SHIFT_M2 \ + FAT_TEDDY_VBMI_SHIFT_M1 \ + m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1); + +#define FAT_TEDDY_VBMI_SHIFT_M3 \ + FAT_TEDDY_VBMI_SHIFT_M2 \ + m512 sl2 = maskz_vpermb512(FAT_TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2); + +#define FAT_TEDDY_VBMI_SHIFT_M4 \ + FAT_TEDDY_VBMI_SHIFT_M3 \ + m512 sl3 = maskz_vpermb512(FAT_TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3); + +#define FAT_SHIFT_OR_M1 \ + shuf_or_b0 + +#define FAT_SHIFT_OR_M2 \ + or512(sl1, FAT_SHIFT_OR_M1) + +#define 
FAT_SHIFT_OR_M3 \
+    or512(sl2, FAT_SHIFT_OR_M2)
+
+#define FAT_SHIFT_OR_M4 \
+    or512(sl3, FAT_SHIFT_OR_M3)
+
+static really_inline
+m512 prep_conf_fat_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
+                            UNUSED const m512 *sl_msk, const m512 val) {
+    PREP_FAT_SHUF_MASK;
+    FAT_TEDDY_VBMI_PSHUFB_OR_M1;
+    FAT_TEDDY_VBMI_SHIFT_M1;
+    return FAT_SHIFT_OR_M1;
+}
+
+static really_inline
+m512 prep_conf_fat_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
+                            const m512 *sl_msk, const m512 val) {
+    PREP_FAT_SHUF_MASK;
+    FAT_TEDDY_VBMI_PSHUFB_OR_M2;
+    FAT_TEDDY_VBMI_SHIFT_M2;
+    return FAT_SHIFT_OR_M2;
+}
+
+static really_inline
+m512 prep_conf_fat_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
+                            const m512 *sl_msk, const m512 val) {
+    PREP_FAT_SHUF_MASK;
+    FAT_TEDDY_VBMI_PSHUFB_OR_M3;
+    FAT_TEDDY_VBMI_SHIFT_M3;
+    return FAT_SHIFT_OR_M3;
+}
+
+static really_inline
+m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
+                            const m512 *sl_msk, const m512 val) {
+    PREP_FAT_SHUF_MASK;
+    FAT_TEDDY_VBMI_PSHUFB_OR_M4;
+    FAT_TEDDY_VBMI_SHIFT_M4;
+    return FAT_SHIFT_OR_M4;
+}
+
+#define PREP_CONF_FAT_FN(val, n) \
+    prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, sl_msk, val)
+
+#define FAT_TEDDY_VBMI_SL1_POS 15
+#define FAT_TEDDY_VBMI_SL2_POS 14
+#define FAT_TEDDY_VBMI_SL3_POS 13
+
+#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1
+
+#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \
+    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1 \
+    sl_msk[0] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL1_POS);
+
+#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \
+    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \
+    sl_msk[1] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL2_POS);
+
+#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M4 \
+    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \
+    sl_msk[2] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL3_POS);
+
+/*
+ * Fat Teddy needs two bytes to represent the result of each input position,
+ * so the mask for each nibble (for example, the lo nibble of the last byte)
+ * is 16x2 bytes wide:
+ * |----------------------------------|----------------------------------|
+ *  16 bytes (bucket 0..7 in each byte)  16 bytes (bucket 8..15 in each byte)
+ *                  A                                     B
+ * At runtime fat Teddy reads 16 bytes of input at a time and duplicates them
+ * to 32 bytes:
+ * |----------------------------------|----------------------------------|
+ *  16 bytes input data (lo nibbles)     16 bytes duplicated data (lo nibbles)
+ *                  X                                     X
+ * and then performs pshufb_m256(AB, XX).
+ *
+ * AVX512-reinforced fat Teddy reads 32 bytes at a time and duplicates them
+ * to 64 bytes:
+ * |----------------|----------------|----------------|----------------|
+ *         X                Y                X                Y
+ * so in this case DUP_FAT_MASK is needed to lay the masks out as AABB:
+ * |----------------|----------------|----------------|----------------|
+ *         A                A                B                B
+ * and then pshufb_m512(AABB, XYXY) is performed.
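+ *
+ * A concrete (illustrative) example: for an input byte 0x4a, the lo nibble
+ * 0xa selects byte 10 of A (bucket bits 0..7) in the low half and byte 10
+ * of B (bucket bits 8..15) in the high half, while the hi nibble 0x4 does
+ * the same against the hi-nibble mask pair. The lookups are OR'd together,
+ * and a bit left clear in the combined result marks a bucket that still
+ * has a candidate literal at that position (all-ones means nothing to
+ * confirm).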
+ */ + +#define PREPARE_FAT_MASKS(n) \ + m512 lo_mask = set64x8(0xf); \ + m512 sl_msk[n - 1]; \ + FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M##n + +#define FAT_TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffULL >> n_sh) +#define FAT_TEDDY_VBMI_CONF_MASK_FULL ((0xffffffffULL << n_sh) & 0xffffffffULL) +#define FAT_TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffULL >> (32 - n) << overlap) +#define FAT_TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffULL >> (32 - n_sh)) + +#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 32; \ + u32 n_sh = n_msk - 1; \ + const size_t loopBytes = 32 - n_sh; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m512 *dup_mask = getDupMaskBase(teddy, n_msk); \ + PREPARE_FAT_MASKS(n_msk); \ + const u32 *confBase = getConfBase(teddy); \ + \ + u64a k = FAT_TEDDY_VBMI_CONF_MASK_FULL; \ + m512 p_mask = set_mask_m512(~((k << 32) | k)); \ + u32 overlap = 0; \ + u64a patch = 0; \ + if (likely(ptr + loopBytes <= buf_end)) { \ + u64a k0 = FAT_TEDDY_VBMI_CONF_MASK_HEAD; \ + m512 p_mask0 = set_mask_m512(~((k0 << 32) | k0)); \ + m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr)), n_msk); \ + r_0 = or512(r_0, p_mask0); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr, conf_fn); \ + ptr += loopBytes; \ + overlap = n_sh; \ + patch = FAT_TEDDY_VBMI_LOAD_MASK_PATCH; \ + } \ + \ + for (; ptr + loopBytes <= buf_end; ptr += loopBytes) { \ + CHECK_FLOOD; \ + m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr - n_sh)), n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, ptr - n_sh, conf_fn); \ + } \ + \ + assert(ptr + loopBytes > buf_end); \ + if (ptr < buf_end) { \ + u32 left = (u32)(buf_end - ptr); \ + u64a k1 = FAT_TEDDY_VBMI_CONF_MASK_VAR(left); \ + m512 p_mask1 = set_mask_m512(~((k1 << 32) | k1)); \ + m512 val_0 = set2x256(loadu_maskz_m256(k1 | patch, ptr - overlap)); \ + m512 r_0 = PREP_CONF_FAT_FN(val_0, n_msk); \ + r_0 = or512(r_0, p_mask1); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr - overlap, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#else // !HAVE_AVX512VBMI, AVX2 normal fat teddy + +#ifdef ARCH_64_BIT +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff256(var, ones256()))) { \ + m256 swap = swap128in256(var); \ + m256 r = interleave256lo(var, swap); \ + u64a part1 = extractlow64from256(r); \ + u64a part2 = extract64from256(r, 1); \ + r = interleave256hi(var, swap); \ + u64a part3 = extractlow64from256(r); \ + u64a part4 = extract64from256(r, 1); \ + CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff256(var, ones256()))) { \ + m256 swap = swap128in256(var); \ + m256 r = interleave256lo(var, swap); \ + u32 part1 = extractlow32from256(r); \ + u32 part2 = extract32from256(r, 1); \ + u32 part3 = extract32from256(r, 2); \ + u32 part4 = extract32from256(r, 3); \ + r = interleave256hi(var, swap); \ + u32 part5 = 
extractlow32from256(r); \ + u32 part6 = extract32from256(r, 1); \ + u32 part7 = extract32from256(r, 2); \ + u32 part8 = extract32from256(r, 3); \ + CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn); \ + } \ +} while(0) +#endif + +static really_inline +m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, + const u8 *buf_history, size_t len_history, + const u32 nMasks) { + m128 p_mask128; + m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi, + buf_history, len_history, nMasks)); + *p_mask = set2x128(p_mask128); + return ret; +} + +static really_inline +m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { + m256 mask = set32x8(0xf); + m256 lo = and256(val, mask); + m256 hi = and256(rshift64_m256(val, 4), mask); + return or256(pshufb_m256(maskBase[0 * 2], lo), + pshufb_m256(maskBase[0 * 2 + 1], hi)); +} + +static really_inline +m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) { + m256 mask = set32x8(0xf); + m256 lo = and256(val, mask); + m256 hi = and256(rshift64_m256(val, 4), mask); + m256 r = prep_conf_fat_teddy_m1(maskBase, val); + + m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo), + pshufb_m256(maskBase[1 * 2 + 1], hi)); + m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1); + *old_1 = res_1; + return or256(r, res_shifted_1); +} + +static really_inline +m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, + m256 val) { + m256 mask = set32x8(0xf); + m256 lo = and256(val, mask); + m256 hi = and256(rshift64_m256(val, 4), mask); + m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val); + + m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo), + pshufb_m256(maskBase[2 * 2 + 1], hi)); + m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2); + *old_2 = res_2; + return or256(r, res_shifted_2); +} + +static really_inline +m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, + m256 *old_3, m256 val) { + m256 mask = set32x8(0xf); + m256 lo = and256(val, mask); + m256 hi = and256(rshift64_m256(val, 4), mask); + m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val); + + m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo), + pshufb_m256(maskBase[3 * 2 + 1], hi)); + m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3); + *old_3 = res_3; + return or256(r, res_shifted_3); +} + +#define FDR_EXEC_FAT_TEDDY_RES_OLD_1 \ +do { \ +} while(0) + +#define FDR_EXEC_FAT_TEDDY_RES_OLD_2 \ + m256 res_old_1 = zeroes256(); + +#define FDR_EXEC_FAT_TEDDY_RES_OLD_3 \ + m256 res_old_1 = zeroes256(); \ + m256 res_old_2 = zeroes256(); + +#define FDR_EXEC_FAT_TEDDY_RES_OLD_4 \ + m256 res_old_1 = zeroes256(); \ + m256 res_old_2 = zeroes256(); \ + m256 res_old_3 = zeroes256(); + +#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n + +#define PREP_CONF_FAT_FN_1(mask_base, val) \ + prep_conf_fat_teddy_m1(mask_base, val) + +#define PREP_CONF_FAT_FN_2(mask_base, val) \ + prep_conf_fat_teddy_m2(mask_base, &res_old_1, val) + +#define PREP_CONF_FAT_FN_3(mask_base, val) \ + 
prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val) + +#define PREP_CONF_FAT_FN_4(mask_base, val) \ + prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val) + +#define PREP_CONF_FAT_FN(mask_base, val, n) \ + PREP_CONF_FAT_FN_##n(mask_base, val) + +#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 32; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m256 *maskBase = getMaskBase_fat(teddy); \ + const u32 *confBase = getConfBase(teddy); \ + \ + FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk); \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 16; \ + m256 p_mask; \ + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, \ + n_msk); \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + if (ptr + 16 <= buf_end) { \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \ + m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk); \ + CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 16 <= buf_end) { \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 16; \ + } \ + \ + assert(ptr + 16 > buf_end); \ + if (ptr < buf_end) { \ + m256 p_mask; \ + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, \ + n_msk); \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#endif // HAVE_AVX512VBMI + +hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); +} + +hwlm_error_t 
fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); +} + +#endif // HAVE_AVX2 diff --git a/regex/fdr/teddy_internal.h b/regex/fdr/teddy_internal.h new file mode 100644 index 000000000..1e9e603fa --- /dev/null +++ b/regex/fdr/teddy_internal.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* Teddy bytecode layout: + * * |-----| + * * | | struct Teddy + * * |-----| + * * | | teddy masks + * * | | + * * |-----| + * * | | reinforcement mask table for bucket 0..7 + * * | | + * * |-----| + * * | | reinforcement mask table for bucket 8..15 (FAT teddy) + * * | | + * * |-----| + * * | | confirm + * * | | + * * | | + * * |-----| + * * | | flood control + * * | | + * * |-----| + */ + +#ifndef TEDDY_INTERNAL_H +#define TEDDY_INTERNAL_H + +#include "ue2common.h" + +// first part is compatible with an FDR +struct Teddy { + u32 engineID; + u32 size; + u32 maxStringLen; + u32 numStrings; + u32 confOffset; + u32 floodOffset; +}; + +#endif diff --git a/regex/fdr/teddy_runtime_common.h b/regex/fdr/teddy_runtime_common.h new file mode 100644 index 000000000..b76800eb0 --- /dev/null +++ b/regex/fdr/teddy_runtime_common.h @@ -0,0 +1,459 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Teddy literal matcher: common runtime procedures. + */ + +#ifndef TEDDY_RUNTIME_COMMON_H_ +#define TEDDY_RUNTIME_COMMON_H_ + +#include "fdr_confirm.h" +#include "fdr_confirm_runtime.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" +#include "util/uniform_ops.h" + +extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32]; +#if defined(HAVE_AVX2) +extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64]; +#endif + +#if defined(HAVE_AVX512VBMI) +static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f +}; +#endif + +#ifdef ARCH_64_BIT +#define TEDDY_CONF_TYPE u64a +#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf) +#else +#define TEDDY_CONF_TYPE u32 +#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf) +#endif + +#define CHECK_HWLM_TERMINATE_MATCHING \ +do { \ + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ + return HWLM_TERMINATED; \ + } \ +} while (0); + +#define CHECK_FLOOD \ +do { \ + if (unlikely(ptr > tryFloodDetect)) { \ + tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, \ + &floodBackoff, &control, iterBytes); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while (0); + +/* + * \brief Copy a block of [0,15] bytes efficiently. + * + * This function is a workaround intended to stop some compilers from + * synthesizing a memcpy function call out of the copy of a small number of + * bytes that we do in vectoredLoad128. 
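The TEDDY_FIND_AND_CLEAR_LSB() helper chosen above drives the confirm loop in do_confWithBit_teddy() further down in this header; it is the standard "iterate over set bits" idiom. A portable scalar sketch, purely for illustration (the patch uses the bitutils.h helpers, not this function):

    #include <stdint.h>

    // Peel off the lowest set bit of a 64-bit confirm word; the caller
    // guarantees *v != 0, as the do { ... } while (*conf) loop does.
    static inline unsigned find_and_clear_lsb64(uint64_t *v)
    {
        unsigned bit = (unsigned)__builtin_ctzll(*v); // index of lowest set bit
        *v &= *v - 1;                                 // clear that bit
        return bit;
    }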
+ */ +static really_inline +void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) { + switch (len) { + case 0: + break; + case 1: + *dst = *src; + break; + case 2: + unaligned_store_u16(dst, unaligned_load_u16(src)); + break; + case 3: + unaligned_store_u16(dst, unaligned_load_u16(src)); + dst[2] = src[2]; + break; + case 4: + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 5: + case 6: + case 7: + /* Perform copy with two overlapping 4-byte chunks. */ + unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 8: + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + default: + /* Perform copy with two overlapping 8-byte chunks. */ + assert(len < 16); + unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + } +} + +// Note: p_mask is an output param that initialises a poison mask. +// *p_mask = load128(p_mask_arr[n] + 16 - m) means: +// m byte 0xff in the beginning, followed by n byte 0x00, +// then followed by the rest bytes 0xff. +// ptr >= lo: +// no history. +// for end/short zone, ptr==lo and start_offset==0 +// for start zone, see below +// lo ptr hi hi +// |----------|-------|----------------|............| +// -start 0 -start+offset MIN(avail,16) +// p_mask ffff..ff0000...........00ffff.......... +// ptr < lo: +// only start zone. +// history +// ptr lo hi hi +// |----------|-------|----------------|............| +// 0 start start+offset end(<=16) +// p_mask ffff.....ffffff..ff0000...........00ffff.......... +static really_inline +m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, + const u8 *buf_history, size_t len_history, + const u32 nMasks) { + union { + u8 val8[16]; + m128 val128; + } u; + u.val128 = zeroes128(); + + uintptr_t copy_start; + uintptr_t copy_len; + + if (ptr >= lo) { // short/end/start zone + uintptr_t start = (uintptr_t)(ptr - lo); + uintptr_t avail = (uintptr_t)(hi - ptr); + if (avail >= 16) { + assert(start_offset - start <= 16); + *p_mask = loadu128(p_mask_arr[16 - start_offset + start] + + 16 - start_offset + start); + return loadu128(ptr); + } + assert(start_offset - start <= avail); + *p_mask = loadu128(p_mask_arr[avail - start_offset + start] + + 16 - start_offset + start); + copy_start = 0; + copy_len = avail; + } else { // start zone + uintptr_t need = MIN((uintptr_t)(lo - ptr), + MIN(len_history, nMasks - 1)); + uintptr_t start = (uintptr_t)(lo - ptr); + uintptr_t i; + for (i = start - need; i < start; i++) { + u.val8[i] = buf_history[len_history - (start - i)]; + } + uintptr_t end = MIN(16, (uintptr_t)(hi - ptr)); + assert(start + start_offset <= end); + *p_mask = loadu128(p_mask_arr[end - start - start_offset] + + 16 - start - start_offset); + copy_start = start; + copy_len = end - start; + } + + // Runt block from the buffer. + copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len); + + return u.val128; +} + +#if defined(HAVE_AVX2) +/* + * \brief Copy a block of [0,31] bytes efficiently. + * + * This function is a workaround intended to stop some compilers from + * synthesizing a memcpy function call out of the copy of a small number of + * bytes that we do in vectoredLoad256. 
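The poison-mask convention used by vectoredLoad128() above is easiest to see in scalar form: every byte of the vector that did not come from the buffer is forced to 0xff, and because the callers OR the mask into the shuffle result (r_0 = or256(r_0, p_mask) in the FDR_EXEC_* loops), a poisoned byte can never become a confirm candidate. A rough sketch with illustrative names, using plain memcpy/memset for clarity where the real code deliberately avoids a memcpy call:

    #include <stdint.h>
    #include <string.h>

    // Fill out[] with the valid bytes at [start, start + valid) and build a
    // matching poison mask; requires start + valid <= 16.
    static void poisoned_load16(uint8_t out[16], uint8_t poison[16],
                                const uint8_t *src, size_t start, size_t valid)
    {
        memset(out, 0, 16);
        memset(poison, 0xff, 16);            // default: every lane poisoned
        memcpy(out + start, src, valid);     // runt copy of the real bytes
        memset(poison + start, 0x00, valid); // mark those lanes as valid
        // caller then ORs poison[] into the nibble-lookup result
    }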
+ */ +static really_inline +void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) { + switch (len) { + case 0: + break; + case 1: + *dst = *src; + break; + case 2: + unaligned_store_u16(dst, unaligned_load_u16(src)); + break; + case 3: + unaligned_store_u16(dst, unaligned_load_u16(src)); + dst[2] = src[2]; + break; + case 4: + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 5: + case 6: + case 7: + /* Perform copy with two overlapping 4-byte chunks. */ + unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 8: + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + /* Perform copy with two overlapping 8-byte chunks. */ + unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 16: + storeu128(dst, loadu128(src)); + break; + default: + /* Perform copy with two overlapping 16-byte chunks. */ + assert(len < 32); + storeu128(dst + len - 16, loadu128(src + len - 16)); + storeu128(dst, loadu128(src)); + break; + } +} + +// Note: p_mask is an output param that initialises a poison mask. +// *p_mask = load256(p_mask_arr256[n] + 32 - m) means: +// m byte 0xff in the beginning, followed by n byte 0x00, +// then followed by the rest bytes 0xff. +// ptr >= lo: +// no history. +// for end/short zone, ptr==lo and start_offset==0 +// for start zone, see below +// lo ptr hi hi +// |----------|-------|----------------|............| +// -start 0 -start+offset MIN(avail,32) +// p_mask ffff..ff0000...........00ffff.......... +// ptr < lo: +// only start zone. +// history +// ptr lo hi hi +// |----------|-------|----------------|............| +// 0 start start+offset end(<=32) +// p_mask ffff.....ffffff..ff0000...........00ffff.......... +static really_inline +m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, + const u8 *buf_history, size_t len_history, + const u32 nMasks) { + union { + u8 val8[32]; + m256 val256; + } u; + u.val256 = zeroes256(); + + uintptr_t copy_start; + uintptr_t copy_len; + + if (ptr >= lo) { // short/end/start zone + uintptr_t start = (uintptr_t)(ptr - lo); + uintptr_t avail = (uintptr_t)(hi - ptr); + if (avail >= 32) { + assert(start_offset - start <= 32); + *p_mask = loadu256(p_mask_arr256[32 - start_offset + start] + + 32 - start_offset + start); + return loadu256(ptr); + } + assert(start_offset - start <= avail); + *p_mask = loadu256(p_mask_arr256[avail - start_offset + start] + + 32 - start_offset + start); + copy_start = 0; + copy_len = avail; + } else { //start zone + uintptr_t need = MIN((uintptr_t)(lo - ptr), + MIN(len_history, nMasks - 1)); + uintptr_t start = (uintptr_t)(lo - ptr); + uintptr_t i; + for (i = start - need; i < start; i++) { + u.val8[i] = buf_history[len_history - (start - i)]; + } + uintptr_t end = MIN(32, (uintptr_t)(hi - ptr)); + assert(start + start_offset <= end); + *p_mask = loadu256(p_mask_arr256[end - start - start_offset] + + 32 - start - start_offset); + copy_start = start; + copy_len = end - start; + } + + // Runt block from the buffer. + copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len); + + return u.val256; +} +#endif // HAVE_AVX2 + +#if defined(HAVE_AVX512) +// Note: p_mask is an output param that initialises a poison mask. 
+// u64a k = ones_u64a << n' >> m'; // m' < n' +// *p_mask = set_mask_m512(~k); +// means p_mask is consist of: +// (n' - m') poison bytes "0xff" at the beginning, +// followed by (64 - n') valid bytes "0x00", +// then followed by the rest m' poison bytes "0xff". +// ptr >= lo: +// no history. +// for end/short zone, ptr==lo and start_offset==0 +// for start zone, see below +// lo ptr hi hi +// |----------|-------|----------------|............| +// -start 0 -start+offset MIN(avail,64) +// p_mask ffff..ff0000...........00ffff.......... +// ptr < lo: +// only start zone. +// history +// ptr lo hi hi +// |----------|-------|----------------|............| +// 0 start start+offset end(<=64) +// p_mask ffff.....ffffff..ff0000...........00ffff.......... +static really_inline +m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen, + const u32 nMasks) { + m512 val; + + uintptr_t copy_start; + uintptr_t copy_len; + + if (ptr >= lo) { // short/end/start zone + uintptr_t start = (uintptr_t)(ptr - lo); + uintptr_t avail = (uintptr_t)(hi - ptr); + if (avail >= 64) { + assert(start_offset - start <= 64); + u64a k = ones_u64a << (start_offset - start); + *p_mask = set_mask_m512(~k); + return loadu512(ptr); + } + assert(start_offset - start <= avail); + u64a k = ones_u64a << (64 - avail + start_offset - start) + >> (64 - avail); + *p_mask = set_mask_m512(~k); + copy_start = 0; + copy_len = avail; + } else { //start zone + uintptr_t need = MIN((uintptr_t)(lo - ptr), + MIN(hlen, nMasks - 1)); + uintptr_t start = (uintptr_t)(lo - ptr); + u64a j = 0x7fffffffffffffffULL >> (63 - need) << (start - need); + val = loadu_maskz_m512(j, &hbuf[hlen - start]); + uintptr_t end = MIN(64, (uintptr_t)(hi - ptr)); + assert(start + start_offset <= end); + u64a k = ones_u64a << (64 - end + start + start_offset) >> (64 - end); + *p_mask = set_mask_m512(~k); + copy_start = start; + copy_len = end - start; + } + + assert(copy_len < 64); + assert(copy_len > 0); + u64a j = ones_u64a >> (64 - copy_len) << copy_start; + val = loadu_mask_m512(val, j, ptr); + + return val; +} +#endif // HAVE_AVX512 + +static really_inline +u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte, + UNUSED CautionReason reason) { + u64a confVal = 0; + const u8 *buf = a->buf; + size_t len = a->len; + const u8 *confirm_loc = ptr + byte - 7; +#if defined(HAVE_AVX512VBMI) + if (likely(confirm_loc >= buf)) { +#else + if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) { +#endif + confVal = lv_u64a(confirm_loc, buf, buf + len); + } else { // r == VECTORING, confirm_loc < buf + u64a histBytes = a->histBytes; + confVal = lv_u64a_ce(confirm_loc, buf, buf + len); + // stitch together confVal and history + u32 overhang = buf - confirm_loc; + histBytes >>= 64 - (overhang * 8); + confVal |= histBytes; + } + return confVal; +} + +static really_inline +void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset, + const u32 *confBase, CautionReason reason, + const struct FDR_Runtime_Args *a, const u8 *ptr, + hwlmcb_rv_t *control, u32 *last_match) { + do { + u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf); + u32 byte = bit / bucket + offset; + u32 idx = bit % bucket; + u32 cf = confBase[idx]; + if (!cf) { + continue; + } + const struct FDRConfirm *fdrc = (const struct FDRConfirm *) + ((const u8 *)confBase + cf); + if (!(fdrc->groups & *control)) { + continue; + } + u64a tmp = 0; + u64a confVal = getConfVal(a, ptr, byte, reason); + confWithBit(fdrc, a, ptr - 
a->buf + byte, control, + last_match, confVal, &tmp, 0); + } while (unlikely(*conf)); +} + +static really_inline +const m128 *getMaskBase(const struct Teddy *teddy) { + return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); +} + +static really_inline +const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) { + return (const u64a *)((const u8 *)getMaskBase(teddy) + + ROUNDUP_CL(2 * numMask * sizeof(m128))); +} + +static really_inline +const u32 *getConfBase(const struct Teddy *teddy) { + return (const u32 *)((const u8 *)teddy + teddy->confOffset); +} + +#endif /* TEDDY_RUNTIME_COMMON_H_ */ diff --git a/regex/hs.h b/regex/hs.h new file mode 100644 index 000000000..2fe5d248b --- /dev/null +++ b/regex/hs.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HS_H_ +#define HS_H_ + +/** + * @file + * @brief The complete Hyperscan API definition. + * + * Hyperscan is a high speed regular expression engine. + * + * This header includes both the Hyperscan compiler and runtime components. See + * the individual component headers for documentation. + */ + +/* The current Hyperscan version information. */ + +#define HS_MAJOR 5 +#define HS_MINOR 4 +#define HS_PATCH 0 + +#include "hs_compile.h" +#include "hs_runtime.h" + +#endif /* HS_H_ */ diff --git a/regex/hs_common.h b/regex/hs_common.h new file mode 100644 index 000000000..8366d0018 --- /dev/null +++ b/regex/hs_common.h @@ -0,0 +1,600 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HS_COMMON_H_ +#define HS_COMMON_H_ + +#if defined(_WIN32) +#define HS_CDECL __cdecl +#else +#define HS_CDECL +#endif +#ifndef __KERNEL__ +#include +#else +#include +#endif + +/** + * @file + * @brief The Hyperscan common API definition. + * + * Hyperscan is a high speed regular expression engine. + * + * This header contains functions available to both the Hyperscan compiler and + * runtime. + */ + +#ifdef __cplusplus +extern "C" +{ +#endif + +struct hs_database; + +/** + * A Hyperscan pattern database. + * + * Generated by one of the Hyperscan compiler functions: + * - @ref hs_compile() + * - @ref hs_compile_multi() + * - @ref hs_compile_ext_multi() + */ +typedef struct hs_database hs_database_t; + +/** + * A type for errors returned by Hyperscan functions. + */ +typedef int hs_error_t; + +/** + * Free a compiled pattern database. + * + * The free callback set by @ref hs_set_database_allocator() (or @ref + * hs_set_allocator()) will be used by this function. + * + * @param db + * A compiled pattern database. NULL may also be safely provided, in which + * case the function does nothing. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_free_database(hs_database_t *db); + +/** + * Serialize a pattern database to a stream of bytes. + * + * The allocator callback set by @ref hs_set_misc_allocator() (or @ref + * hs_set_allocator()) will be used by this function. + * + * @param db + * A compiled pattern database. + * + * @param bytes + * On success, a pointer to an array of bytes will be returned here. + * These bytes can be subsequently relocated or written to disk. The + * caller is responsible for freeing this block. + * + * @param length + * On success, the number of bytes in the generated byte array will be + * returned here. + * + * @return + * @ref HS_SUCCESS on success, @ref HS_NOMEM if the byte array cannot be + * allocated, other values may be returned if errors are detected. + */ +hs_error_t HS_CDECL hs_serialize_database(const hs_database_t *db, char **bytes, + size_t *length); + +/** + * Reconstruct a pattern database from a stream of bytes previously generated + * by @ref hs_serialize_database(). + * + * This function will allocate sufficient space for the database using the + * allocator set with @ref hs_set_database_allocator() (or @ref + * hs_set_allocator()); to use a pre-allocated region of memory, use the @ref + * hs_deserialize_database_at() function. 
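Taken together, hs_serialize_database() and hs_deserialize_database() give a simple save/restore path for a compiled database. A minimal sketch, assuming db is an existing compiled database and the default malloc()-based allocators (so the serialized buffer can be released with free()), with error handling abbreviated:

    char *bytes = NULL;
    size_t length = 0;
    if (hs_serialize_database(db, &bytes, &length) != HS_SUCCESS) {
        // handle error
    }
    // ... persist bytes[0..length) to disk or ship it to another host ...
    hs_database_t *restored = NULL;
    if (hs_deserialize_database(bytes, length, &restored) != HS_SUCCESS) {
        // handle error
    }
    free(bytes);                 // serialized buffer came from the misc allocator
    // ... scan with restored, then:
    hs_free_database(restored);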
+ * + * @param bytes + * A byte array generated by @ref hs_serialize_database() representing a + * compiled pattern database. + * + * @param length + * The length of the byte array generated by @ref hs_serialize_database(). + * This should be the same value as that returned by @ref + * hs_serialize_database(). + * + * @param db + * On success, a pointer to a newly allocated @ref hs_database_t will be + * returned here. This database can then be used for scanning, and + * eventually freed by the caller using @ref hs_free_database(). + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_deserialize_database(const char *bytes, + const size_t length, + hs_database_t **db); + +/** + * Reconstruct a pattern database from a stream of bytes previously generated + * by @ref hs_serialize_database() at a given memory location. + * + * This function (unlike @ref hs_deserialize_database()) will write the + * reconstructed database to the memory location given in the @p db parameter. + * The amount of space required at this location can be determined with the + * @ref hs_serialized_database_size() function. + * + * @param bytes + * A byte array generated by @ref hs_serialize_database() representing a + * compiled pattern database. + * + * @param length + * The length of the byte array generated by @ref hs_serialize_database(). + * This should be the same value as that returned by @ref + * hs_serialize_database(). + * + * @param db + * Pointer to an 8-byte aligned block of memory of sufficient size to hold + * the deserialized database. On success, the reconstructed database will + * be written to this location. This database can then be used for pattern + * matching. The user is responsible for freeing this memory; the @ref + * hs_free_database() call should not be used. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_deserialize_database_at(const char *bytes, + const size_t length, + hs_database_t *db); + +/** + * Provides the size of the stream state allocated by a single stream opened + * against the given database. + * + * @param database + * Pointer to a compiled (streaming mode) pattern database. + * + * @param stream_size + * On success, the size in bytes of an individual stream opened against the + * given database is placed in this parameter. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_stream_size(const hs_database_t *database, + size_t *stream_size); + +/** + * Provides the size of the given database in bytes. + * + * @param database + * Pointer to compiled pattern database. + * + * @param database_size + * On success, the size of the compiled database in bytes is placed in this + * parameter. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_database_size(const hs_database_t *database, + size_t *database_size); + +/** + * Utility function for reporting the size that would be required by a + * database if it were deserialized. + * + * This can be used to allocate a shared memory region or other "special" + * allocation prior to deserializing with the @ref hs_deserialize_database_at() + * function. + * + * @param bytes + * Pointer to a byte array generated by @ref hs_serialize_database() + * representing a compiled pattern database. + * + * @param length + * The length of the byte array generated by @ref hs_serialize_database(). 
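For the caller-managed variant described above, hs_serialized_database_size() (declared just below) reports how large the 8-byte aligned region for hs_deserialize_database_at() must be. A minimal sketch, reusing the bytes/length pair from the previous example and abbreviating error handling:

    size_t db_size = 0;
    if (hs_serialized_database_size(bytes, length, &db_size) != HS_SUCCESS) {
        // handle error
    }
    // malloc() returns suitably aligned memory here; a shared-memory segment
    // or other pre-allocated region is the more typical use case.
    hs_database_t *db_at = malloc(db_size);
    if (!db_at || hs_deserialize_database_at(bytes, length, db_at) != HS_SUCCESS) {
        // handle error
    }
    // ... scan with db_at; release it yourself, not via hs_free_database():
    free(db_at);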
+ * This should be the same value as that returned by @ref + * hs_serialize_database(). + * + * @param deserialized_size + * On success, the size of the compiled database that would be generated + * by @ref hs_deserialize_database_at() is returned here. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_serialized_database_size(const char *bytes, + const size_t length, + size_t *deserialized_size); + +/** + * Utility function providing information about a database. + * + * @param database + * Pointer to a compiled database. + * + * @param info + * On success, a string containing the version and platform information for + * the supplied database is placed in the parameter. The string is + * allocated using the allocator supplied in @ref hs_set_misc_allocator() + * (or malloc() if no allocator was set) and should be freed by the caller. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_database_info(const hs_database_t *database, + char **info); + +/** + * Utility function providing information about a serialized database. + * + * @param bytes + * Pointer to a serialized database. + * + * @param length + * Length in bytes of the serialized database. + * + * @param info + * On success, a string containing the version and platform information + * for the supplied serialized database is placed in the parameter. The + * string is allocated using the allocator supplied in @ref + * hs_set_misc_allocator() (or malloc() if no allocator was set) and + * should be freed by the caller. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_serialized_database_info(const char *bytes, + size_t length, char **info); + +/** + * The type of the callback function that will be used by Hyperscan to allocate + * more memory at runtime as required, for example in @ref hs_open_stream() to + * allocate stream state. + * + * If Hyperscan is to be used in a multi-threaded, or similarly concurrent + * environment, the allocation function will need to be re-entrant, or + * similarly safe for concurrent use. + * + * @param size + * The number of bytes to allocate. + * @return + * A pointer to the region of memory allocated, or NULL on error. + */ +typedef void *(HS_CDECL *hs_alloc_t)(size_t size); + +/** + * The type of the callback function that will be used by Hyperscan to free + * memory regions previously allocated using the @ref hs_alloc_t function. + * + * @param ptr + * The region of memory to be freed. + */ +typedef void (HS_CDECL *hs_free_t)(void *ptr); + +/** + * Set the allocate and free functions used by Hyperscan for allocating + * memory at runtime for stream state, scratch space, database bytecode, + * and various other data structure returned by the Hyperscan API. + * + * The function is equivalent to calling @ref hs_set_stream_allocator(), + * @ref hs_set_scratch_allocator(), @ref hs_set_database_allocator() and + * @ref hs_set_misc_allocator() with the provided parameters. + * + * This call will override any previous allocators that have been set. + * + * Note: there is no way to change the allocator used for temporary objects + * created during the various compile calls (@ref hs_compile(), @ref + * hs_compile_multi(), @ref hs_compile_ext_multi()). + * + * @param alloc_func + * A callback function pointer that allocates memory. This function must + * return memory suitably aligned for the largest representable data type + * on this platform. 
+ * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_set_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); + +/** + * Set the allocate and free functions used by Hyperscan for allocating memory + * for database bytecode produced by the compile calls (@ref hs_compile(), @ref + * hs_compile_multi(), @ref hs_compile_ext_multi()) and by database + * deserialization (@ref hs_deserialize_database()). + * + * If no database allocation functions are set, or if NULL is used in place of + * both parameters, then memory allocation will default to standard methods + * (such as the system malloc() and free() calls). + * + * This call will override any previous database allocators that have been set. + * + * Note: the database allocator may also be set by calling @ref + * hs_set_allocator(). + * + * Note: there is no way to change how temporary objects created during the + * various compile calls (@ref hs_compile(), @ref hs_compile_multi(), @ref + * hs_compile_ext_multi()) are allocated. + * + * @param alloc_func + * A callback function pointer that allocates memory. This function must + * return memory suitably aligned for the largest representable data type + * on this platform. + * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_set_database_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); + +/** + * Set the allocate and free functions used by Hyperscan for allocating memory + * for items returned by the Hyperscan API such as @ref hs_compile_error_t, @ref + * hs_expr_info_t and serialized databases. + * + * If no misc allocation functions are set, or if NULL is used in place of both + * parameters, then memory allocation will default to standard methods (such as + * the system malloc() and free() calls). + * + * This call will override any previous misc allocators that have been set. + * + * Note: the misc allocator may also be set by calling @ref hs_set_allocator(). + * + * @param alloc_func + * A callback function pointer that allocates memory. This function must + * return memory suitably aligned for the largest representable data type + * on this platform. + * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_set_misc_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); + +/** + * Set the allocate and free functions used by Hyperscan for allocating memory + * for scratch space by @ref hs_alloc_scratch() and @ref hs_clone_scratch(). + * + * If no scratch allocation functions are set, or if NULL is used in place of + * both parameters, then memory allocation will default to standard methods + * (such as the system malloc() and free() calls). + * + * This call will override any previous scratch allocators that have been set. + * + * Note: the scratch allocator may also be set by calling @ref + * hs_set_allocator(). + * + * @param alloc_func + * A callback function pointer that allocates memory. This function must + * return memory suitably aligned for the largest representable data type + * on this platform. + * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. 
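A global allocator override is just a pair of hs_alloc_t/hs_free_t callbacks installed with hs_set_allocator() above. A minimal sketch with an illustrative counting wrapper around the system allocator; note the requirement that the allocation callback return memory aligned for the largest representable data type, which malloc() already satisfies:

    #include <stdlib.h>

    static size_t hs_alloc_calls;            // illustrative accounting only

    static void *counting_alloc(size_t size)
    {
        hs_alloc_calls++;
        return malloc(size);                 // meets the alignment requirement
    }

    static void counting_free(void *ptr)
    {
        free(ptr);
    }

    static void install_hs_allocators(void)
    {
        // call once at start-up, before any compile or scan work
        if (hs_set_allocator(counting_alloc, counting_free) != HS_SUCCESS) {
            // handle error
        }
    }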
+ */ +hs_error_t HS_CDECL hs_set_scratch_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); + +/** + * Set the allocate and free functions used by Hyperscan for allocating memory + * for stream state by @ref hs_open_stream(). + * + * If no stream allocation functions are set, or if NULL is used in place of + * both parameters, then memory allocation will default to standard methods + * (such as the system malloc() and free() calls). + * + * This call will override any previous stream allocators that have been set. + * + * Note: the stream allocator may also be set by calling @ref + * hs_set_allocator(). + * + * @param alloc_func + * A callback function pointer that allocates memory. This function must + * return memory suitably aligned for the largest representable data type + * on this platform. + * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_set_stream_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); + +/** + * Utility function for identifying this release version. + * + * @return + * A string containing the version number of this release build and the + * date of the build. It is allocated statically, so it does not need to + * be freed by the caller. + */ +const char * HS_CDECL hs_version(void); + +/** + * Utility function to test the current system architecture. + * + * Hyperscan requires the Supplemental Streaming SIMD Extensions 3 instruction + * set. This function can be called on any x86 platform to determine if the + * system provides the required instruction set. + * + * This function does not test for more advanced features if Hyperscan has + * been built for a more specific architecture, for example the AVX2 + * instruction set. + * + * @return + * @ref HS_SUCCESS on success, @ref HS_ARCH_ERROR if system does not + * support Hyperscan. + */ +hs_error_t HS_CDECL hs_valid_platform(void); + +/** + * @defgroup HS_ERROR hs_error_t values + * + * @{ + */ + +/** + * The engine completed normally. + */ +#define HS_SUCCESS 0 + +/** + * A parameter passed to this function was invalid. + * + * This error is only returned in cases where the function can detect an + * invalid parameter -- it cannot be relied upon to detect (for example) + * pointers to freed memory or other invalid data. + */ +#define HS_INVALID (-1) + +/** + * A memory allocation failed. + */ +#define HS_NOMEM (-2) + +/** + * The engine was terminated by callback. + * + * This return value indicates that the target buffer was partially scanned, + * but that the callback function requested that scanning cease after a match + * was located. + */ +#define HS_SCAN_TERMINATED (-3) + +/** + * The pattern compiler failed, and the @ref hs_compile_error_t should be + * inspected for more detail. + */ +#define HS_COMPILER_ERROR (-4) + +/** + * The given database was built for a different version of Hyperscan. + */ +#define HS_DB_VERSION_ERROR (-5) + +/** + * The given database was built for a different platform (i.e., CPU type). + */ +#define HS_DB_PLATFORM_ERROR (-6) + +/** + * The given database was built for a different mode of operation. This error + * is returned when streaming calls are used with a block or vectored database + * and vice versa. + */ +#define HS_DB_MODE_ERROR (-7) + +/** + * A parameter passed to this function was not correctly aligned. 
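hs_valid_platform() and hs_version() above are the usual start-up sanity checks before any other Hyperscan call is made. A minimal sketch:

    // Returns 0 if the host CPU can run this Hyperscan build, -1 otherwise.
    static int check_hs_platform(void)
    {
        const char *v = hs_version();       // static string, never freed
        (void)v;                            // e.g. log it during start-up
        if (hs_valid_platform() != HS_SUCCESS)
            return -1;                      // CPU lacks the required SSSE3
        return 0;
    }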
+ */ +#define HS_BAD_ALIGN (-8) + +/** + * The memory allocator (either malloc() or the allocator set with @ref + * hs_set_allocator()) did not correctly return memory suitably aligned for the + * largest representable data type on this platform. + */ +#define HS_BAD_ALLOC (-9) + +/** + * The scratch region was already in use. + * + * This error is returned when Hyperscan is able to detect that the scratch + * region given is already in use by another Hyperscan API call. + * + * A separate scratch region, allocated with @ref hs_alloc_scratch() or @ref + * hs_clone_scratch(), is required for every concurrent caller of the Hyperscan + * API. + * + * For example, this error might be returned when @ref hs_scan() has been + * called inside a callback delivered by a currently-executing @ref hs_scan() + * call using the same scratch region. + * + * Note: Not all concurrent uses of scratch regions may be detected. This error + * is intended as a best-effort debugging tool, not a guarantee. + */ +#define HS_SCRATCH_IN_USE (-10) + +/** + * Unsupported CPU architecture. + * + * This error is returned when Hyperscan is able to detect that the current + * system does not support the required instruction set. + * + * At a minimum, Hyperscan requires Supplemental Streaming SIMD Extensions 3 + * (SSSE3). + */ +#define HS_ARCH_ERROR (-11) + +/** + * Provided buffer was too small. + * + * This error indicates that there was insufficient space in the buffer. The + * call should be repeated with a larger provided buffer. + * + * Note: in this situation, it is normal for the amount of space required to be + * returned in the same manner as the used space would have been returned if the + * call was successful. + */ +#define HS_INSUFFICIENT_SPACE (-12) + +/** + * Unexpected internal error. + * + * This error indicates that there was unexpected matching behaviors. This + * could be related to invalid usage of stream and scratch space or invalid memory + * operations by users. + * + */ +#define HS_UNKNOWN_ERROR (-13) + +/** @} */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* HS_COMMON_H_ */ diff --git a/regex/hs_compile.h b/regex/hs_compile.h new file mode 100644 index 000000000..b318c29db --- /dev/null +++ b/regex/hs_compile.h @@ -0,0 +1,1224 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HS_COMPILE_H_ +#define HS_COMPILE_H_ + +/** + * @file + * @brief The Hyperscan compiler API definition. + * + * Hyperscan is a high speed regular expression engine. + * + * This header contains functions for compiling regular expressions into + * Hyperscan databases that can be used by the Hyperscan runtime. + */ + +#include "hs_common.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * A type containing error details that is returned by the compile calls (@ref + * hs_compile(), @ref hs_compile_multi() and @ref hs_compile_ext_multi()) on + * failure. The caller may inspect the values returned in this type to + * determine the cause of failure. + * + * Common errors generated during the compile process include: + * + * - *Invalid parameter* + * + * An invalid argument was specified in the compile call. + * + * - *Unrecognised flag* + * + * An unrecognised value was passed in the flags argument. + * + * - *Pattern matches empty buffer* + * + * By default, Hyperscan only supports patterns that will *always* + * consume at least one byte of input. Patterns that do not have this + * property (such as `/(abc)?/`) will produce this error unless + * the @ref HS_FLAG_ALLOWEMPTY flag is supplied. Note that such + * patterns will produce a match for *every* byte when scanned. + * + * - *Embedded anchors not supported* + * + * Hyperscan only supports the use of anchor meta-characters (such as + * `^` and `$`) in patterns where they could *only* match + * at the start or end of a buffer. A pattern containing an embedded + * anchor, such as `/abc^def/`, can never match, as there is no + * way for `abc` to precede the start of the data stream. + * + * - *Bounded repeat is too large* + * + * The pattern contains a repeated construct with very large finite + * bounds. + * + * - *Unsupported component type* + * + * An unsupported PCRE construct was used in the pattern. + * + * - *Unable to generate bytecode* + * + * This error indicates that Hyperscan was unable to compile a pattern + * that is syntactically valid. The most common cause is a pattern that is + * very long and complex or contains a large repeated subpattern. + * + * - *Unable to allocate memory* + * + * The library was unable to allocate temporary storage used during + * compilation time. + * + * - *Allocator returned misaligned memory* + * + * The memory allocator (either malloc() or the allocator set with @ref + * hs_set_allocator()) did not correctly return memory suitably aligned + * for the largest representable data type on this platform. + * + * - *Internal error* + * + * An unexpected error occurred: if this error is reported, please contact + * the Hyperscan team with a description of the situation. + */ +typedef struct hs_compile_error { + /** + * A human-readable error message describing the error. + */ + char *message; + + /** + * The zero-based number of the expression that caused the error (if this + * can be determined). 
If the error is not specific to an expression, then + * this value will be less than zero. + */ + int expression; +} hs_compile_error_t; + +/** + * A type containing information on the target platform which may optionally be + * provided to the compile calls (@ref hs_compile(), @ref hs_compile_multi(), + * @ref hs_compile_ext_multi()). + * + * A hs_platform_info structure may be populated for the current platform by + * using the @ref hs_populate_platform() call. + */ +typedef struct hs_platform_info { + /** + * Information about the target platform which may be used to guide the + * optimisation process of the compile. + * + * Use of this field does not limit the processors that the resulting + * database can run on, but may impact the performance of the resulting + * database. + */ + unsigned int tune; + + /** + * Relevant CPU features available on the target platform + * + * This value may be produced by combining HS_CPU_FEATURE_* flags (such as + * @ref HS_CPU_FEATURES_AVX2). Multiple CPU features may be or'ed together + * to produce the value. + */ + unsigned long long cpu_features; + + /** + * Reserved for future use. + */ + unsigned long long reserved1; + + /** + * Reserved for future use. + */ + unsigned long long reserved2; +} hs_platform_info_t; + +/** + * A type containing information related to an expression that is returned by + * @ref hs_expression_info() or @ref hs_expression_ext_info. + */ +typedef struct hs_expr_info { + /** + * The minimum length in bytes of a match for the pattern. + * + * Note: in some cases when using advanced features to suppress matches + * (such as extended parameters or the @ref HS_FLAG_SINGLEMATCH flag) this + * may represent a conservative lower bound for the true minimum length of + * a match. + */ + unsigned int min_width; + + /** + * The maximum length in bytes of a match for the pattern. If the pattern + * has an unbounded maximum length, this will be set to the maximum value + * of an unsigned int (UINT_MAX). + * + * Note: in some cases when using advanced features to suppress matches + * (such as extended parameters or the @ref HS_FLAG_SINGLEMATCH flag) this + * may represent a conservative upper bound for the true maximum length of + * a match. + */ + unsigned int max_width; + + /** + * Whether this expression can produce matches that are not returned in + * order, such as those produced by assertions. Zero if false, non-zero if + * true. + */ + char unordered_matches; + + /** + * Whether this expression can produce matches at end of data (EOD). In + * streaming mode, EOD matches are raised during @ref hs_close_stream(), + * since it is only when @ref hs_close_stream() is called that the EOD + * location is known. Zero if false, non-zero if true. + * + * Note: trailing `\b` word boundary assertions may also result in EOD + * matches as end-of-data can act as a word boundary. + */ + char matches_at_eod; + + /** + * Whether this expression can *only* produce matches at end of data (EOD). + * In streaming mode, all matches for this expression are raised during + * @ref hs_close_stream(). Zero if false, non-zero if true. + */ + char matches_only_at_eod; +} hs_expr_info_t; + +/** + * A structure containing additional parameters related to an expression, + * passed in at build time to @ref hs_compile_ext_multi() or @ref + * hs_expression_ext_info. + * + * These parameters allow the set of matches produced by a pattern to be + * constrained at compile time, rather than relying on the application to + * process unwanted matches at runtime. 
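As a concrete illustration of the structure defined just below: constraining one pattern so that its matches end within the first kilobyte of the stream and span at least four bytes would be expressed roughly as follows (the structure is then passed per-expression to hs_compile_ext_multi(); that call is omitted here):

    hs_expr_ext_t ext = {
        .flags      = HS_EXT_FLAG_MAX_OFFSET | HS_EXT_FLAG_MIN_LENGTH,
        .max_offset = 1024,   // no match may end past stream offset 1024
        .min_length = 4,      // suppress matches shorter than 4 bytes
    };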
+ */ +typedef struct hs_expr_ext { + /** + * Flags governing which parts of this structure are to be used by the + * compiler. See @ref HS_EXT_FLAG. + */ + unsigned long long flags; + + /** + * The minimum end offset in the data stream at which this expression + * should match successfully. To use this parameter, set the + * @ref HS_EXT_FLAG_MIN_OFFSET flag in the hs_expr_ext::flags field. + */ + unsigned long long min_offset; + + /** + * The maximum end offset in the data stream at which this expression + * should match successfully. To use this parameter, set the + * @ref HS_EXT_FLAG_MAX_OFFSET flag in the hs_expr_ext::flags field. + */ + unsigned long long max_offset; + + /** + * The minimum match length (from start to end) required to successfully + * match this expression. To use this parameter, set the + * @ref HS_EXT_FLAG_MIN_LENGTH flag in the hs_expr_ext::flags field. + */ + unsigned long long min_length; + + /** + * Allow patterns to approximately match within this edit distance. To use + * this parameter, set the @ref HS_EXT_FLAG_EDIT_DISTANCE flag in the + * hs_expr_ext::flags field. + */ + unsigned edit_distance; + + /** + * Allow patterns to approximately match within this Hamming distance. To + * use this parameter, set the @ref HS_EXT_FLAG_HAMMING_DISTANCE flag in the + * hs_expr_ext::flags field. + */ + unsigned hamming_distance; +} hs_expr_ext_t; + +/** + * @defgroup HS_EXT_FLAG hs_expr_ext_t flags + * + * These flags are used in @ref hs_expr_ext_t::flags to indicate which fields + * are used. + * + * @{ + */ + +/** Flag indicating that the hs_expr_ext::min_offset field is used. */ +#define HS_EXT_FLAG_MIN_OFFSET 1ULL + +/** Flag indicating that the hs_expr_ext::max_offset field is used. */ +#define HS_EXT_FLAG_MAX_OFFSET 2ULL + +/** Flag indicating that the hs_expr_ext::min_length field is used. */ +#define HS_EXT_FLAG_MIN_LENGTH 4ULL + +/** Flag indicating that the hs_expr_ext::edit_distance field is used. */ +#define HS_EXT_FLAG_EDIT_DISTANCE 8ULL + +/** Flag indicating that the hs_expr_ext::hamming_distance field is used. */ +#define HS_EXT_FLAG_HAMMING_DISTANCE 16ULL + +/** @} */ + +/** + * The basic regular expression compiler. + * + * This is the function call with which an expression is compiled into a + * Hyperscan database which can be passed to the runtime functions (such as + * @ref hs_scan(), @ref hs_open_stream(), etc.) + * + * @param expression + * The NULL-terminated expression to parse. Note that this string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @p flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a + * flags. + * + * @param flags + * Flags which modify the behaviour of the expression. Multiple flags may + * be used by ORing them together. Valid values are: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the + * expression per stream. + * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an + * empty string, such as `.*`. + * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - HS_FLAG_UCP - Use Unicode properties for character classes. 
+ * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. + * + * @param mode + * Compiler mode flags that affect the database as a whole. One of @ref + * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be + * supplied, to select between the generation of a streaming, block or + * vectored database. In addition, other flags (beginning with HS_MODE_) + * may be supplied to enable specific features. See @ref HS_MODE_FLAG for + * more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref hs_free_database() function. + * + * @param error + * If the compile fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the error + * parameter. + */ +hs_error_t HS_CDECL hs_compile(const char *expression, unsigned int flags, + unsigned int mode, + const hs_platform_info_t *platform, + hs_database_t **db, hs_compile_error_t **error); + +/** + * The multiple regular expression compiler. + * + * This is the function call with which a set of expressions is compiled into a + * database which can be passed to the runtime functions (such as @ref + * hs_scan(), @ref hs_open_stream(), etc.) Each expression can be labelled with + * a unique integer which is passed into the match callback to identify the + * pattern that has matched. + * + * @param expressions + * Array of NULL-terminated expressions to compile. Note that (as for @ref + * hs_compile()) these strings must contain only the pattern to be + * matched, with no delimiters or flags. For example, the expression + * `/abc?def/i` should be compiled by providing `abc?def` as the first + * string in the @p expressions array, and @ref HS_FLAG_CASELESS as the + * first value in the @p flags array. + * + * @param flags + * Array of flags which modify the behaviour of each expression. Multiple + * flags may be used by ORing them together. Specifying the NULL pointer + * in place of an array will set the flags value for all patterns to zero. + * Valid values are: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated by patterns + * with this match id per stream. + * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an + * empty string, such as `.*`. + * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - HS_FLAG_UCP - Use Unicode properties for character classes. 
+ * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. + * + * @param ids + * An array of integers specifying the ID number to be associated with the + * corresponding pattern in the expressions array. Specifying the NULL + * pointer in place of an array will set the ID value for all patterns to + * zero. + * + * @param elements + * The number of elements in the input arrays. + * + * @param mode + * Compiler mode flags that affect the database as a whole. One of @ref + * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be + * supplied, to select between the generation of a streaming, block or + * vectored database. In addition, other flags (beginning with HS_MODE_) + * may be supplied to enable specific features. See @ref HS_MODE_FLAG for + * more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref hs_free_database() function. + * + * @param error + * If the compile fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the @p error + * parameter. + * + */ +hs_error_t HS_CDECL hs_compile_multi(const char *const *expressions, + const unsigned int *flags, + const unsigned int *ids, + unsigned int elements, unsigned int mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error); + +/** + * The multiple regular expression compiler with extended parameter support. + * + * This function call compiles a group of expressions into a database in the + * same way as @ref hs_compile_multi(), but allows additional parameters to be + * specified via an @ref hs_expr_ext_t structure per expression. + * + * @param expressions + * Array of NULL-terminated expressions to compile. Note that (as for @ref + * hs_compile()) these strings must contain only the pattern to be + * matched, with no delimiters or flags. For example, the expression + * `/abc?def/i` should be compiled by providing `abc?def` as the first + * string in the @p expressions array, and @ref HS_FLAG_CASELESS as the + * first value in the @p flags array. + * + * @param flags + * Array of flags which modify the behaviour of each expression. Multiple + * flags may be used by ORing them together. Specifying the NULL pointer + * in place of an array will set the flags value for all patterns to zero. + * Valid values are: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. 
+ * - HS_FLAG_SINGLEMATCH - Only one match will be generated by patterns + * with this match id per stream. + * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an + * empty string, such as `.*`. + * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - HS_FLAG_UCP - Use Unicode properties for character classes. + * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. + * + * @param ids + * An array of integers specifying the ID number to be associated with the + * corresponding pattern in the expressions array. Specifying the NULL + * pointer in place of an array will set the ID value for all patterns to + * zero. + * + * @param ext + * An array of pointers to filled @ref hs_expr_ext_t structures that + * define extended behaviour for each pattern. NULL may be specified if no + * extended behaviour is needed for an individual pattern, or in place of + * the whole array if it is not needed for any expressions. Memory used by + * these structures must be both allocated and freed by the caller. + * + * @param elements + * The number of elements in the input arrays. + * + * @param mode + * Compiler mode flags that affect the database as a whole. One of @ref + * HS_MODE_STREAM, @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be + * supplied, to select between the generation of a streaming, block or + * vectored database. In addition, other flags (beginning with HS_MODE_) + * may be supplied to enable specific features. See @ref HS_MODE_FLAG for + * more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref hs_free_database() function. + * + * @param error + * If the compile fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the @p error + * parameter. + * + */ +hs_error_t HS_CDECL hs_compile_ext_multi(const char *const *expressions, + const unsigned int *flags, + const unsigned int *ids, + const hs_expr_ext_t *const *ext, + unsigned int elements, unsigned int mode, + const hs_platform_info_t *platform, + hs_database_t **db, hs_compile_error_t **error); + +/** + * The basic pure literal expression compiler. + * + * This is the function call with which a pure literal expression (not a + * common regular expression) is compiled into a Hyperscan database which + * can be passed to the runtime functions (such as @ref hs_scan(), + * @ref hs_open_stream(), etc.) + * + * @param expression + * The NULL-terminated expression to parse. 
Note that this string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @p flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a + * flags. Meanwhile, the string content shall be fully parsed in a literal + * sense without any regular grammars. For example, the @p expression + * `abc?` simply means a char sequence of `a`, `b`, `c`, and `?`. The `?` + * here doesn't mean 0 or 1 quantifier under regular semantics. + * + * @param flags + * Flags which modify the behaviour of the expression. Multiple flags may + * be used by ORing them together. Compared to @ref hs_compile(), fewer + * valid values are provided: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the + * expression per stream. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * + * @param len + * The length of the text content of the pure literal expression. As the + * text content indicated by @p expression is treated as single character + * one by one, the special terminating character `\0` should be allowed + * to appear in expression, and not treated as a terminator for a string. + * Thus, the end of a pure literal expression cannot be indicated by + * identifying `\0`, but by counting to the expression length. + * + * @param mode + * Compiler mode flags that affect the database as a whole. One of @ref + * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be + * supplied, to select between the generation of a streaming, block or + * vectored database. In addition, other flags (beginning with HS_MODE_) + * may be supplied to enable specific features. See @ref HS_MODE_FLAG for + * more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref hs_free_database() function. + * + * @param error + * If the compile fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the error + * parameter. + */ +hs_error_t HS_CDECL hs_compile_lit(const char *expression, unsigned flags, + const size_t len, unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error); +/** + * The multiple pure literal expression compiler. + * + * This is the function call with which a set of pure literal expressions is + * compiled into a database which can be passed to the runtime functions (such + * as @ref hs_scan(), @ref hs_open_stream(), etc.) Each expression can be + * labelled with a unique integer which is passed into the match callback to + * identify the pattern that has matched. + * + * @param expressions + * The NULL-terminated expression to parse. 
Note that this string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @p flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a + * flags. Meanwhile, the string content shall be fully parsed in a literal + * sense without any regular grammars. For example, the @p expression + * `abc?` simply means a char sequence of `a`, `b`, `c`, and `?`. The `?` + * here doesn't mean 0 or 1 quantifier under regular semantics. + * + * @param flags + * Array of flags which modify the behaviour of each expression. Multiple + * flags may be used by ORing them together. Specifying the NULL pointer + * in place of an array will set the flags value for all patterns to zero. + * Compared to @ref hs_compile_multi(), fewer valid values are provided: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the + * expression per stream. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * + * @param ids + * An array of integers specifying the ID number to be associated with the + * corresponding pattern in the expressions array. Specifying the NULL + * pointer in place of an array will set the ID value for all patterns to + * zero. + * + * @param lens + * Array of lengths of the text content of each pure literal expression. + * As the text content indicated by @p expression is treated as single + * character one by one, the special terminating character `\0` should be + * allowed to appear in expression, and not treated as a terminator for a + * string. Thus, the end of a pure literal expression cannot be indicated + * by identifying `\0`, but by counting to the expression length. + * + * @param elements + * The number of elements in the input arrays. + * + * @param mode + * Compiler mode flags that affect the database as a whole. One of @ref + * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be + * supplied, to select between the generation of a streaming, block or + * vectored database. In addition, other flags (beginning with HS_MODE_) + * may be supplied to enable specific features. See @ref HS_MODE_FLAG for + * more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref hs_free_database() function. + * + * @param error + * If the compile fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the error + * parameter. 
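+ *
+ * A minimal usage sketch (the literals, IDs and flag choices below are
+ * illustrative only, and error details are simply discarded):
+ *
+ * @code
+ *     const char *exprs[] = { "abc?", "def" };
+ *     const unsigned flags[] = { HS_FLAG_CASELESS, 0 };
+ *     const unsigned ids[] = { 1, 2 };
+ *     const size_t lens[] = { 4, 3 };
+ *     hs_database_t *db = NULL;
+ *     hs_compile_error_t *err = NULL;
+ *
+ *     if (hs_compile_lit_multi(exprs, flags, ids, lens, 2, HS_MODE_BLOCK,
+ *                              NULL, &db, &err) != HS_SUCCESS)
+ *         hs_free_compile_error(err);   // compilation failed
+ * @endcode
+ *
+ * Note that the `?` in the first literal is matched as a plain character,
+ * not as a quantifier.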
+ */ +hs_error_t HS_CDECL hs_compile_lit_multi(const char * const *expressions, + const unsigned *flags, + const unsigned *ids, + const size_t *lens, + unsigned elements, unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error); + +/** + * Free an error structure generated by @ref hs_compile(), @ref + * hs_compile_multi() or @ref hs_compile_ext_multi(). + * + * @param error + * The @ref hs_compile_error_t to be freed. NULL may also be safely + * provided. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error); + +/** + * Utility function providing information about a regular expression. The + * information provided in @ref hs_expr_info_t includes the minimum and maximum + * width of a pattern match. + * + * Note: successful analysis of an expression with this function does not imply + * that compilation of the same expression (via @ref hs_compile(), @ref + * hs_compile_multi() or @ref hs_compile_ext_multi()) would succeed. This + * function may return @ref HS_SUCCESS for regular expressions that Hyperscan + * cannot compile. + * + * Note: some per-pattern flags (such as @ref HS_FLAG_ALLOWEMPTY, @ref + * HS_FLAG_SOM_LEFTMOST) are accepted by this call, but as they do not affect + * the properties returned in the @ref hs_expr_info_t structure, they will not + * affect the outcome of this function. + * + * @param expression + * The NULL-terminated expression to parse. Note that this string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @p flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a + * flags. + * + * @param flags + * Flags which modify the behaviour of the expression. Multiple flags may + * be used by ORing them together. Valid values are: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated by the + * expression per stream. + * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an + * empty string, such as `.*`. + * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - HS_FLAG_UCP - Use Unicode properties for character classes. + * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. + * + * @param info + * On success, a pointer to the pattern information will be returned in + * this parameter, or NULL on failure. This structure is allocated using + * the allocator supplied in @ref hs_set_allocator() (or malloc() if no + * allocator was set) and should be freed by the caller. + * + * @param error + * If the call fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. 
+ * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the error + * parameter. + */ +hs_error_t HS_CDECL hs_expression_info(const char *expression, + unsigned int flags, + hs_expr_info_t **info, + hs_compile_error_t **error); + +/** + * Utility function providing information about a regular expression, with + * extended parameter support. The information provided in @ref hs_expr_info_t + * includes the minimum and maximum width of a pattern match. + * + * Note: successful analysis of an expression with this function does not imply + * that compilation of the same expression (via @ref hs_compile(), @ref + * hs_compile_multi() or @ref hs_compile_ext_multi()) would succeed. This + * function may return @ref HS_SUCCESS for regular expressions that Hyperscan + * cannot compile. + * + * Note: some per-pattern flags (such as @ref HS_FLAG_ALLOWEMPTY, @ref + * HS_FLAG_SOM_LEFTMOST) are accepted by this call, but as they do not affect + * the properties returned in the @ref hs_expr_info_t structure, they will not + * affect the outcome of this function. + * + * @param expression + * The NULL-terminated expression to parse. Note that this string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @p flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a + * flags. + * + * @param flags + * Flags which modify the behaviour of the expression. Multiple flags may + * be used by ORing them together. Valid values are: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated by the + * expression per stream. + * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an + * empty string, such as `.*`. + * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - HS_FLAG_UCP - Use Unicode properties for character classes. + * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. + * + * @param ext + * A pointer to a filled @ref hs_expr_ext_t structure that defines + * extended behaviour for this pattern. NULL may be specified if no + * extended parameters are needed. + * + * @param info + * On success, a pointer to the pattern information will be returned in + * this parameter, or NULL on failure. This structure is allocated using + * the allocator supplied in @ref hs_set_allocator() (or malloc() if no + * allocator was set) and should be freed by the caller. + * + * @param error + * If the call fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the error + * parameter. 
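+ *
+ * A hedged sketch of a typical call (the pattern is arbitrary and no
+ * extended parameters are supplied, so NULL is passed for @p ext):
+ *
+ * @code
+ *     hs_expr_info_t *info = NULL;
+ *     hs_compile_error_t *err = NULL;
+ *
+ *     if (hs_expression_ext_info("abc?def", HS_FLAG_CASELESS, NULL,
+ *                                &info, &err) == HS_SUCCESS) {
+ *         // inspect info->min_width and info->max_width; the caller is
+ *         // responsible for freeing info afterwards
+ *     } else {
+ *         hs_free_compile_error(err);
+ *     }
+ * @endcode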
+ */ +hs_error_t HS_CDECL hs_expression_ext_info(const char *expression, + unsigned int flags, + const hs_expr_ext_t *ext, + hs_expr_info_t **info, + hs_compile_error_t **error); + +/** + * Populates the platform information based on the current host. + * + * @param platform + * On success, the pointed to structure is populated based on the current + * host. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform); + +/** + * @defgroup HS_PATTERN_FLAG Pattern flags + * + * @{ + */ + +/** + * Compile flag: Set case-insensitive matching. + * + * This flag sets the expression to be matched case-insensitively by default. + * The expression may still use PCRE tokens (notably `(?i)` and + * `(?-i)`) to switch case-insensitive matching on and off. + */ +#define HS_FLAG_CASELESS 1 + +/** + * Compile flag: Matching a `.` will not exclude newlines. + * + * This flag sets any instances of the `.` token to match newline characters as + * well as all other characters. The PCRE specification states that the `.` + * token does not match newline characters by default, so without this flag the + * `.` token will not cross line boundaries. + */ +#define HS_FLAG_DOTALL 2 + +/** + * Compile flag: Set multi-line anchoring. + * + * This flag instructs the expression to make the `^` and `$` tokens match + * newline characters as well as the start and end of the stream. If this flag + * is not specified, the `^` token will only ever match at the start of a + * stream, and the `$` token will only ever match at the end of a stream within + * the guidelines of the PCRE specification. + */ +#define HS_FLAG_MULTILINE 4 + +/** + * Compile flag: Set single-match only mode. + * + * This flag sets the expression's match ID to match at most once. In streaming + * mode, this means that the expression will return only a single match over + * the lifetime of the stream, rather than reporting every match as per + * standard Hyperscan semantics. In block mode or vectored mode, only the first + * match for each invocation of @ref hs_scan() or @ref hs_scan_vector() will be + * returned. + * + * If multiple expressions in the database share the same match ID, then they + * either must all specify @ref HS_FLAG_SINGLEMATCH or none of them specify + * @ref HS_FLAG_SINGLEMATCH. If a group of expressions sharing a match ID + * specify the flag, then at most one match with the match ID will be generated + * per stream. + * + * Note: The use of this flag in combination with @ref HS_FLAG_SOM_LEFTMOST + * is not currently supported. + */ +#define HS_FLAG_SINGLEMATCH 8 + +/** + * Compile flag: Allow expressions that can match against empty buffers. + * + * This flag instructs the compiler to allow expressions that can match against + * empty buffers, such as `.?`, `.*`, `(a|)`. Since Hyperscan can return every + * possible match for an expression, such expressions generally execute very + * slowly; the default behaviour is to return an error when an attempt to + * compile one is made. Using this flag will force the compiler to allow such + * an expression. + */ +#define HS_FLAG_ALLOWEMPTY 16 + +/** + * Compile flag: Enable UTF-8 mode for this expression. + * + * This flag instructs Hyperscan to treat the pattern as a sequence of UTF-8 + * characters. The results of scanning invalid UTF-8 sequences with a Hyperscan + * library that has been compiled with one or more patterns using this flag are + * undefined. 
+ */ +#define HS_FLAG_UTF8 32 + +/** + * Compile flag: Enable Unicode property support for this expression. + * + * This flag instructs Hyperscan to use Unicode properties, rather than the + * default ASCII interpretations, for character mnemonics like `\w` and `\s` as + * well as the POSIX character classes. It is only meaningful in conjunction + * with @ref HS_FLAG_UTF8. + */ +#define HS_FLAG_UCP 64 + +/** + * Compile flag: Enable prefiltering mode for this expression. + * + * This flag instructs Hyperscan to compile an "approximate" version of this + * pattern for use in a prefiltering application, even if Hyperscan does not + * support the pattern in normal operation. + * + * The set of matches returned when this flag is used is guaranteed to be a + * superset of the matches specified by the non-prefiltering expression. + * + * If the pattern contains pattern constructs not supported by Hyperscan (such + * as zero-width assertions, back-references or conditional references) these + * constructs will be replaced internally with broader constructs that may + * match more often. + * + * Furthermore, in prefiltering mode Hyperscan may simplify a pattern that + * would otherwise return a "Pattern too large" error at compile time, or for + * performance reasons (subject to the matching guarantee above). + * + * It is generally expected that the application will subsequently confirm + * prefilter matches with another regular expression matcher that can provide + * exact matches for the pattern. + * + * Note: The use of this flag in combination with @ref HS_FLAG_SOM_LEFTMOST + * is not currently supported. + */ +#define HS_FLAG_PREFILTER 128 + +/** + * Compile flag: Enable leftmost start of match reporting. + * + * This flag instructs Hyperscan to report the leftmost possible start of match + * offset when a match is reported for this expression. (By default, no start + * of match is returned.) + * + * For all the 3 modes, enabling this behaviour may reduce performance. And + * particularly, it may increase stream state requirements in streaming mode. + */ +#define HS_FLAG_SOM_LEFTMOST 256 + +/** + * Compile flag: Logical combination. + * + * This flag instructs Hyperscan to parse this expression as logical + * combination syntax. + * Logical constraints consist of operands, operators and parentheses. + * The operands are expression indices, and operators can be + * '!'(NOT), '&'(AND) or '|'(OR). + * For example: + * (101&102&103)|(104&!105) + * ((301|302)&303)&(304|305) + */ +#define HS_FLAG_COMBINATION 512 + +/** + * Compile flag: Don't do any match reporting. + * + * This flag instructs Hyperscan to ignore match reporting for this expression. + * It is designed to be used on the sub-expressions in logical combinations. + */ +#define HS_FLAG_QUIET 1024 + +/** @} */ + +/** + * @defgroup HS_CPU_FEATURES_FLAG CPU feature support flags + * + * @{ + */ + +/** + * CPU features flag - Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2) + * + * Setting this flag indicates that the target platform supports AVX2 + * instructions. + */ +#define HS_CPU_FEATURES_AVX2 (1ULL << 2) + +/** + * CPU features flag - Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX512) + * + * Setting this flag indicates that the target platform supports AVX512 + * instructions, specifically AVX-512BW. Using AVX512 implies the use of AVX2. 
+ */ +#define HS_CPU_FEATURES_AVX512 (1ULL << 3) + +/** + * CPU features flag - Intel(R) Advanced Vector Extensions 512 + * Vector Byte Manipulation Instructions (Intel(R) AVX512VBMI) + * + * Setting this flag indicates that the target platform supports AVX512VBMI + * instructions. Using AVX512VBMI implies the use of AVX512. + */ +#define HS_CPU_FEATURES_AVX512VBMI (1ULL << 4) + +/** @} */ + +/** + * @defgroup HS_TUNE_FLAG Tuning flags + * + * @{ + */ + +/** + * Tuning Parameter - Generic + * + * This indicates that the compiled database should not be tuned for any + * particular target platform. + */ +#define HS_TUNE_FAMILY_GENERIC 0 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Sandy Bridge + * + * This indicates that the compiled database should be tuned for the + * Sandy Bridge microarchitecture. + */ +#define HS_TUNE_FAMILY_SNB 1 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Ivy Bridge + * + * This indicates that the compiled database should be tuned for the + * Ivy Bridge microarchitecture. + */ +#define HS_TUNE_FAMILY_IVB 2 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Haswell + * + * This indicates that the compiled database should be tuned for the + * Haswell microarchitecture. + */ +#define HS_TUNE_FAMILY_HSW 3 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Silvermont + * + * This indicates that the compiled database should be tuned for the + * Silvermont microarchitecture. + */ +#define HS_TUNE_FAMILY_SLM 4 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Broadwell + * + * This indicates that the compiled database should be tuned for the + * Broadwell microarchitecture. + */ +#define HS_TUNE_FAMILY_BDW 5 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Skylake + * + * This indicates that the compiled database should be tuned for the + * Skylake microarchitecture. + */ +#define HS_TUNE_FAMILY_SKL 6 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Skylake Server + * + * This indicates that the compiled database should be tuned for the + * Skylake Server microarchitecture. + */ +#define HS_TUNE_FAMILY_SKX 7 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Goldmont + * + * This indicates that the compiled database should be tuned for the + * Goldmont microarchitecture. + */ +#define HS_TUNE_FAMILY_GLM 8 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Icelake + * + * This indicates that the compiled database should be tuned for the + * Icelake microarchitecture. + */ +#define HS_TUNE_FAMILY_ICL 9 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Icelake Server + * + * This indicates that the compiled database should be tuned for the + * Icelake Server microarchitecture. + */ +#define HS_TUNE_FAMILY_ICX 10 + +/** @} */ + +/** + * @defgroup HS_MODE_FLAG Compile mode flags + * + * The mode flags are used as values for the mode parameter of the various + * compile calls (@ref hs_compile(), @ref hs_compile_multi() and @ref + * hs_compile_ext_multi()). + * + * A mode value can be built by ORing these flag values together; the only + * required flag is one of @ref HS_MODE_BLOCK, @ref HS_MODE_STREAM or @ref + * HS_MODE_VECTORED. Other flags may be added to enable support for additional + * features. + * + * @{ + */ + +/** + * Compiler mode flag: Block scan (non-streaming) database. + */ +#define HS_MODE_BLOCK 1 + +/** + * Compiler mode flag: Alias for @ref HS_MODE_BLOCK. 
+ */ +#define HS_MODE_NOSTREAM 1 + +/** + * Compiler mode flag: Streaming database. + */ +#define HS_MODE_STREAM 2 + +/** + * Compiler mode flag: Vectored scanning database. + */ +#define HS_MODE_VECTORED 4 + +/** + * Compiler mode flag: use full precision to track start of match offsets in + * stream state. + * + * This mode will use the most stream state per pattern, but will always return + * an accurate start of match offset regardless of how far back in the past it + * was found. + * + * One of the SOM_HORIZON modes must be selected to use the @ref + * HS_FLAG_SOM_LEFTMOST expression flag. + */ +#define HS_MODE_SOM_HORIZON_LARGE (1U << 24) + +/** + * Compiler mode flag: use medium precision to track start of match offsets in + * stream state. + * + * This mode will use less stream state than @ref HS_MODE_SOM_HORIZON_LARGE and + * will limit start of match accuracy to offsets within 2^32 bytes of the + * end of match offset reported. + * + * One of the SOM_HORIZON modes must be selected to use the @ref + * HS_FLAG_SOM_LEFTMOST expression flag. + */ +#define HS_MODE_SOM_HORIZON_MEDIUM (1U << 25) + +/** + * Compiler mode flag: use limited precision to track start of match offsets in + * stream state. + * + * This mode will use less stream state than @ref HS_MODE_SOM_HORIZON_LARGE and + * will limit start of match accuracy to offsets within 2^16 bytes of the + * end of match offset reported. + * + * One of the SOM_HORIZON modes must be selected to use the @ref + * HS_FLAG_SOM_LEFTMOST expression flag. + */ +#define HS_MODE_SOM_HORIZON_SMALL (1U << 26) + +/** @} */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* HS_COMPILE_H_ */ diff --git a/regex/hs_internal.h b/regex/hs_internal.h new file mode 100644 index 000000000..adf07b22c --- /dev/null +++ b/regex/hs_internal.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Internal-use only definitions. Available to internal tools. 
+ */ + +#ifndef HS_INTERNAL_H +#define HS_INTERNAL_H + +#include "ue2common.h" +#include "hs.h" + +#ifdef __cplusplus + +namespace ue2 { + +struct Grey; + +/** \brief Internal use only: takes a Grey argument so that we can use it in + * tools. */ +hs_error_t hs_compile_multi_int(const char *const *expressions, + const unsigned *flags, const unsigned *ids, + const hs_expr_ext *const *ext, + unsigned elements, unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **comp_error, const Grey &g); + +/** \brief Internal use only: takes a Grey argument so that we can use it in + * tools. */ +hs_error_t hs_compile_lit_multi_int(const char *const *expressions, + const unsigned *flags, const unsigned *ids, + const hs_expr_ext *const *ext, + const size_t *lens, unsigned elements, + unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **comp_error, + const Grey &g); +} // namespace ue2 + +extern "C" +{ +#endif + +#define HS_MATCH_FLAG_ADJUSTED 1U + +/** \brief Bitmask of all valid Hyperscan flags. */ +#define HS_FLAG_ALL ( HS_FLAG_CASELESS \ + | HS_FLAG_DOTALL \ + | HS_FLAG_MULTILINE \ + | HS_FLAG_UTF8 \ + | HS_FLAG_UCP \ + | HS_FLAG_PREFILTER \ + | HS_FLAG_SINGLEMATCH \ + | HS_FLAG_ALLOWEMPTY \ + | HS_FLAG_SOM_LEFTMOST) + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/regex/hs_runtime.h b/regex/hs_runtime.h new file mode 100644 index 000000000..c757aa2c9 --- /dev/null +++ b/regex/hs_runtime.h @@ -0,0 +1,683 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HS_RUNTIME_H_ +#define HS_RUNTIME_H_ + +#ifndef __KERNEL__ +#include +#else +#include +#endif + +/** + * @file + * @brief The Hyperscan runtime API definition. + * + * Hyperscan is a high speed regular expression engine. + * + * This header contains functions for using compiled Hyperscan databases for + * scanning data at runtime. 
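+ *
+ * A minimal, illustrative streaming sketch (the database `db`, the buffer
+ * `buf`/`buf_len` and the `on_match` callback are assumed to be set up
+ * elsewhere, and error handling is omitted for brevity):
+ *
+ * @code
+ *     hs_scratch_t *scratch = NULL;
+ *     hs_stream_t *stream = NULL;
+ *
+ *     hs_alloc_scratch(db, &scratch);
+ *     hs_open_stream(db, 0, &stream);
+ *     hs_scan_stream(stream, buf, buf_len, 0, scratch, on_match, NULL);
+ *     hs_close_stream(stream, scratch, on_match, NULL);
+ *     hs_free_scratch(scratch);
+ * @endcode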
+ */ + +#include "hs_common.h" +//#include "fw/str.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * Definition of the stream identifier type. + */ +struct hs_stream; + +/** + * The stream identifier returned by @ref hs_open_stream(). + */ +typedef struct hs_stream hs_stream_t; + +struct hs_scratch; + +/** + * A Hyperscan scratch space. + */ +typedef struct hs_scratch hs_scratch_t; + +/** + * Definition of the match event callback function type. + * + * A callback function matching the defined type must be provided by the + * application calling the @ref hs_scan(), @ref hs_scan_vector() or @ref + * hs_scan_stream() functions (or other streaming calls which can produce + * matches). + * + * This callback function will be invoked whenever a match is located in the + * target data during the execution of a scan. The details of the match are + * passed in as parameters to the callback function, and the callback function + * should return a value indicating whether or not matching should continue on + * the target data. If no callbacks are desired from a scan call, NULL may be + * provided in order to suppress match production. + * + * This callback function should not attempt to call Hyperscan API functions on + * the same stream nor should it attempt to reuse the scratch space allocated + * for the API calls that caused it to be triggered. Making another call to the + * Hyperscan library with completely independent parameters should work (for + * example, scanning a different database in a new stream and with new scratch + * space), but reusing data structures like stream state and/or scratch space + * will produce undefined behavior. + * + * @param id + * The ID number of the expression that matched. If the expression was a + * single expression compiled with @ref hs_compile(), this value will be + * zero. + * + * @param from + * - If a start of match flag is enabled for the current pattern, this + * argument will be set to the start of match for the pattern assuming + * that that start of match value lies within the current 'start of match + * horizon' chosen by one of the SOM_HORIZON mode flags. + + * - If the start of match value lies outside this horizon (possible only + * when the SOM_HORIZON value is not @ref HS_MODE_SOM_HORIZON_LARGE), + * the @p from value will be set to @ref HS_OFFSET_PAST_HORIZON. + + * - This argument will be set to zero if the Start of Match flag is not + * enabled for the given pattern. + * + * @param to + * The offset after the last byte that matches the expression. + * + * @param flags + * This is provided for future use and is unused at present. + * + * @param context + * The pointer supplied by the user to the @ref hs_scan(), @ref + * hs_scan_vector() or @ref hs_scan_stream() function. + * + * @return + * Non-zero if the matching should cease, else zero. If scanning is + * performed in streaming mode and a non-zero value is returned, any + * subsequent calls to @ref hs_scan_stream() for that stream will + * immediately return with @ref HS_SCAN_TERMINATED. + */ +typedef int (HS_CDECL *match_event_handler)(unsigned int id, + unsigned long long from, + unsigned long long to, + unsigned int flags, + void *context); + +/** + * Open and initialise a stream. + * + * @param db + * A compiled pattern database. + * + * @param flags + * Flags modifying the behaviour of the stream. This parameter is provided + * for future use and is unused at present. 
+ * + * @param stream + * On success, a pointer to the generated @ref hs_stream_t will be + * returned; NULL on failure. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_open_stream(const hs_database_t *db, unsigned int flags, + hs_stream_t **stream); + +/** + * Write data to be scanned to the opened stream. + * + * This is the function call in which the actual pattern matching takes place + * as data is written to the stream. Matches will be returned via the @ref + * match_event_handler callback supplied. + * + * @param id + * The stream ID (returned by @ref hs_open_stream()) to which the data + * will be written. + * + * @param data + * Pointer to the data to be scanned. + * + * @param length + * The number of bytes to scan. + * + * @param flags + * Flags modifying the behaviour of the stream. This parameter is provided + * for future use and is unused at present. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch(). + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param ctxt + * The user defined pointer which will be passed to the callback function + * when a match occurs. + * + * @return + * Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the + * match callback indicated that scanning should stop; other values on + * error. + */ +hs_error_t HS_CDECL hs_scan_stream(hs_stream_t *id, const char *data, + unsigned int length, unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *ctxt); + +/** + * Close a stream. + * + * This function completes matching on the given stream and frees the memory + * associated with the stream state. After this call, the stream pointed to by + * @p id is invalid and can no longer be used. To reuse the stream state after + * completion, rather than closing it, the @ref hs_reset_stream function can be + * used. + * + * This function must be called for any stream created with @ref + * hs_open_stream(), even if scanning has been terminated by a non-zero return + * from the match callback function. + * + * Note: This operation may result in matches being returned (via calls to the + * match event callback) for expressions anchored to the end of the data stream + * (for example, via the use of the `$` meta-character). If these matches are + * not desired, NULL may be provided as the @ref match_event_handler callback. + * + * If NULL is provided as the @ref match_event_handler callback, it is + * permissible to provide a NULL scratch. + * + * @param id + * The stream ID returned by @ref hs_open_stream(). + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch(). This is + * allowed to be NULL only if the @p onEvent callback is also NULL. + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param ctxt + * The user defined pointer which will be passed to the callback function + * when a match occurs. + * + * @return + * Returns @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch, + match_event_handler onEvent, void *ctxt); + +/** + * Reset a stream to an initial state. + * + * Conceptually, this is equivalent to performing @ref hs_close_stream() on the + * given stream, followed by a @ref hs_open_stream(). 
This new stream replaces + * the original stream in memory, avoiding the overhead of freeing the old + * stream and allocating the new one. + * + * Note: This operation may result in matches being returned (via calls to the + * match event callback) for expressions anchored to the end of the original + * data stream (for example, via the use of the `$` meta-character). If these + * matches are not desired, NULL may be provided as the @ref match_event_handler + * callback. + * + * Note: the stream will also be tied to the same database. + * + * @param id + * The stream (as created by @ref hs_open_stream()) to be replaced. + * + * @param flags + * Flags modifying the behaviour of the stream. This parameter is provided + * for future use and is unused at present. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch(). This is + * allowed to be NULL only if the @p onEvent callback is also NULL. + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param context + * The user defined pointer which will be passed to the callback function + * when a match occurs. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_reset_stream(hs_stream_t *id, unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context); + +/** + * Duplicate the given stream. The new stream will have the same state as the + * original including the current stream offset. + * + * @param to_id + * On success, a pointer to the new, copied @ref hs_stream_t will be + * returned; NULL on failure. + * + * @param from_id + * The stream (as created by @ref hs_open_stream()) to be copied. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_copy_stream(hs_stream_t **to_id, + const hs_stream_t *from_id); + +/** + * Duplicate the given 'from' stream state onto the 'to' stream. The 'to' stream + * will first be reset (reporting any EOD matches if a non-NULL @p onEvent + * callback handler is provided). + * + * Note: the 'to' stream and the 'from' stream must be open against the same + * database. + * + * @param to_id + * On success, a pointer to the new, copied @ref hs_stream_t will be + * returned; NULL on failure. + * + * @param from_id + * The stream (as created by @ref hs_open_stream()) to be copied. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch(). This is + * allowed to be NULL only if the @p onEvent callback is also NULL. + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param context + * The user defined pointer which will be passed to the callback function + * when a match occurs. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_reset_and_copy_stream(hs_stream_t *to_id, + const hs_stream_t *from_id, + hs_scratch_t *scratch, + match_event_handler onEvent, + void *context); + +/** + * Creates a compressed representation of the provided stream in the buffer + * provided. This compressed representation can be converted back into a stream + * state by using @ref hs_expand_stream() or @ref hs_reset_and_expand_stream(). + * The size of the compressed representation will be placed into @p used_space. 
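+ *
+ * As an illustrative sketch only (stream creation and buffer allocation are
+ * assumed to happen in the caller), the required size can first be queried
+ * by passing a NULL buffer of zero size, as described below:
+ *
+ * @code
+ *     size_t needed = 0, used = 0;
+ *
+ *     hs_compress_stream(stream, NULL, 0, &needed);   // size query only
+ *     // ... allocate 'buf' with at least 'needed' bytes, then:
+ *     hs_compress_stream(stream, buf, needed, &used);
+ * @endcode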
+ * + * If there is not sufficient space in the buffer to hold the compressed + * representation, @ref HS_INSUFFICIENT_SPACE will be returned and @p used_space + * will be populated with the amount of space required. + * + * Note: this function does not close the provided stream, you may continue to + * use the stream or to free it with @ref hs_close_stream(). + * + * @param stream + * The stream (as created by @ref hs_open_stream()) to be compressed. + * + * @param buf + * Buffer to write the compressed representation into. Note: if the call is + * just being used to determine the amount of space required, it is allowed + * to pass NULL here and @p buf_space as 0. + * + * @param buf_space + * The number of bytes in @p buf. If buf_space is too small, the call will + * fail with @ref HS_INSUFFICIENT_SPACE. + * + * @param used_space + * Pointer to where the amount of used space will be written to. The used + * buffer space is always less than or equal to @p buf_space. If the call + * fails with @ref HS_INSUFFICIENT_SPACE, this pointer will be used to + * write out the amount of buffer space required. + * + * @return + * @ref HS_SUCCESS on success, @ref HS_INSUFFICIENT_SPACE if the provided + * buffer is too small. + */ +hs_error_t HS_CDECL hs_compress_stream(const hs_stream_t *stream, char *buf, + size_t buf_space, size_t *used_space); + +/** + * Decompresses a compressed representation created by @ref hs_compress_stream() + * into a new stream. + * + * Note: @p buf must correspond to a complete compressed representation created + * by @ref hs_compress_stream() of a stream that was opened against @p db. It is + * not always possible to detect misuse of this API and behaviour is undefined + * if these properties are not satisfied. + * + * @param db + * The compiled pattern database that the compressed stream was opened + * against. + * + * @param stream + * On success, a pointer to the expanded @ref hs_stream_t will be + * returned; NULL on failure. + * + * @param buf + * A compressed representation of a stream. These compressed forms are + * created by @ref hs_compress_stream(). + * + * @param buf_size + * The size in bytes of the compressed representation. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_expand_stream(const hs_database_t *db, + hs_stream_t **stream, const char *buf, + size_t buf_size); + +/** + * Decompresses a compressed representation created by @ref hs_compress_stream() + * on top of the 'to' stream. The 'to' stream will first be reset (reporting + * any EOD matches if a non-NULL @p onEvent callback handler is provided). + * + * Note: the 'to' stream must be opened against the same database as the + * compressed stream. + * + * Note: @p buf must correspond to a complete compressed representation created + * by @ref hs_compress_stream() of a stream that was opened against @p db. It is + * not always possible to detect misuse of this API and behaviour is undefined + * if these properties are not satisfied. + * + * @param to_stream + * A pointer to a valid stream state. A pointer to the expanded @ref + * hs_stream_t will be returned; NULL on failure. + * + * @param buf + * A compressed representation of a stream. These compressed forms are + * created by @ref hs_compress_stream(). + * + * @param buf_size + * The size in bytes of the compressed representation. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch(). 
This is + * allowed to be NULL only if the @p onEvent callback is also NULL. + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param context + * The user defined pointer which will be passed to the callback function + * when a match occurs. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_reset_and_expand_stream(hs_stream_t *to_stream, + const char *buf, size_t buf_size, + hs_scratch_t *scratch, + match_event_handler onEvent, + void *context); + +/** + * The block (non-streaming) regular expression scanner. + * + * This is the function call in which the actual pattern matching takes place + * for block-mode pattern databases. + * + * @param db + * A compiled pattern database. + * + * @param data + * Pointer to the data to be scanned. + * + * @param length + * The number of bytes to scan. + * + * @param flags + * Flags modifying the behaviour of this function. This parameter is + * provided for future use and is unused at present. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch() for this + * database. + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param context + * The user defined pointer which will be passed to the callback function. + * + * @return + * Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the + * match callback indicated that scanning should stop; other values on + * error. + */ +hs_error_t HS_CDECL hs_scan(const hs_database_t *db, const char *data, + unsigned int length, unsigned int flags, + hs_scratch_t *scratch, match_event_handler onEvent, + void *context); + +/** + * The vectored regular expression scanner. + * + * This is the function call in which the actual pattern matching takes place + * for vectoring-mode pattern databases. + * + * @param db + * A compiled pattern database. + * + * @param data + * An array of pointers to the data blocks to be scanned. + * + * @param length + * An array of lengths (in bytes) of each data block to scan. + * + * @param count + * Number of data blocks to scan. This should correspond to the size of + * of the @p data and @p length arrays. + * + * @param flags + * Flags modifying the behaviour of this function. This parameter is + * provided for future use and is unused at present. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch() for + * this database. + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param context + * The user defined pointer which will be passed to the callback function. + * + * @return + * Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the match + * callback indicated that scanning should stop; other values on error. + */ + +hs_error_t HS_CDECL hs_scan_vector(const hs_database_t *db, + const char *const *data, + const unsigned int *length, + unsigned int count, unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context); + + + + +/** + * The vectored regular expression scanner. + * + * This is the function call in which the actual pattern matching takes place + * for vectoring-mode pattern databases. + * + * @param db + * A compiled pattern database. + * + * @param data + * TfwStr chunked string. 
+ * + * @param flags + * Flags modifying the behaviour of this function. This parameter is + * provided for future use and is unused at present. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch() for + * this database. + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param context + * The user defined pointer which will be passed to the callback function. + * + * @return + * Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the match + * callback indicated that scanning should stop; other values on error. + */ + +hs_error_t HS_CDECL hs_scan_tfwstr(const hs_database_t *db, + const void *data,/*TfwStr*/ + unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context); + +/** + * Allocate a "scratch" space for use by Hyperscan. + * + * This is required for runtime use, and one scratch space per thread, or + * concurrent caller, is required. Any allocator callback set by @ref + * hs_set_scratch_allocator() or @ref hs_set_allocator() will be used by this + * function. + * + * @param db + * The database, as produced by @ref hs_compile(). + * + * @param scratch + * On first allocation, a pointer to NULL should be provided so a new + * scratch can be allocated. If a scratch block has been previously + * allocated, then a pointer to it should be passed back in to see if it + * is valid for this database block. If a new scratch block is required, + * the original will be freed and the new one returned, otherwise the + * previous scratch block will be returned. On success, the scratch block + * will be suitable for use with the provided database in addition to any + * databases that original scratch space was suitable for. + * + * @return + * @ref HS_SUCCESS on successful allocation; @ref HS_NOMEM if the + * allocation fails. Other errors may be returned if invalid parameters + * are specified. + */ +hs_error_t HS_CDECL hs_alloc_scratch(const hs_database_t *db, + hs_scratch_t **scratch); + +/** + * Allocate a scratch space that is a clone of an existing scratch space. + * + * This is useful when multiple concurrent threads will be using the same set + * of compiled databases, and another scratch space is required. Any allocator + * callback set by @ref hs_set_scratch_allocator() or @ref hs_set_allocator() + * will be used by this function. + * + * @param src + * The existing @ref hs_scratch_t to be cloned. + * + * @param dest + * A pointer to the new scratch space will be returned here. + * + * @return + * @ref HS_SUCCESS on success; @ref HS_NOMEM if the allocation fails. + * Other errors may be returned if invalid parameters are specified. + */ +hs_error_t HS_CDECL hs_clone_scratch(const hs_scratch_t *src, + hs_scratch_t **dest); + +/** + * Like @ref hs_clone_scratch() but writes to a preallocated buffer. + * + * @param src + * The existing @ref hs_scratch_t to be cloned. + * + * @param dest + * A pointer where scratch space should be initialized. + * + * @return + * @ref HS_SUCCESS on success; + * @ref HS_INVALID if dest is NULL or badly aligned. + */ +hs_error_t HS_CDECL hs_init_scratch(const hs_scratch_t *src, hs_scratch_t *dest); + +/** + * Provides the size of the given scratch space. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch() or @ref + * hs_clone_scratch(). + * + * @param scratch_size + * On success, the size of the scratch space in bytes is placed in this + * parameter. 
+ * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_scratch_size(const hs_scratch_t *scratch, + size_t *scratch_size); + +/** + * Free a scratch block previously allocated by @ref hs_alloc_scratch() or @ref + * hs_clone_scratch(). + * + * The free callback set by @ref hs_set_scratch_allocator() or @ref + * hs_set_allocator() will be used by this function. + * + * @param scratch + * The scratch block to be freed. NULL may also be safely provided. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_free_scratch(hs_scratch_t *scratch); + +/** + * Callback 'from' return value, indicating that the start of this match was + * too early to be tracked with the requested SOM_HORIZON precision. + */ +#define HS_OFFSET_PAST_HORIZON (~0ULL) + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* HS_RUNTIME_H_ */ diff --git a/regex/hs_version.c b/regex/hs_version.c new file mode 100644 index 000000000..04cf46f3f --- /dev/null +++ b/regex/hs_version.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "ue2common.h" +#include "hs_common.h" +#include "hs_version.h" + +HS_PUBLIC_API +const char * HS_CDECL hs_version(void) { + return HS_VERSION_STRING; +} diff --git a/regex/hwlm/hwlm.c b/regex/hwlm/hwlm.c new file mode 100644 index 000000000..24aa26a4c --- /dev/null +++ b/regex/hwlm/hwlm.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Hamster Wheel Literal Matcher: runtime. + */ +#include "hwlm.h" +#include "hwlm_internal.h" +#include "noodle_engine.h" +#include "scratch.h" +#include "ue2common.h" +#include "fdr/fdr.h" +#include "nfa/accel.h" +#include "nfa/shufti.h" +#include "nfa/truffle.h" +#include "nfa/vermicelli.h" +#ifndef __KERNEL__ +#include +#else +#include +#endif + +#define MIN_ACCEL_LEN_BLOCK 16 +#define MIN_ACCEL_LEN_STREAM 16 + +static really_inline +const u8 *run_hwlm_accel(const union AccelAux *aux, const u8 *ptr, + const u8 *end) { + switch (aux->accel_type) { + case ACCEL_VERM: + DEBUG_PRINTF("single vermicelli for 0x%02hhx\n", aux->verm.c); + return vermicelliExec(aux->verm.c, 0, ptr, end); + case ACCEL_VERM_NOCASE: + DEBUG_PRINTF("single vermicelli-nocase for 0x%02hhx\n", aux->verm.c); + return vermicelliExec(aux->verm.c, 1, ptr, end); + case ACCEL_DVERM: + DEBUG_PRINTF("double vermicelli for 0x%02hhx%02hhx\n", aux->dverm.c1, + aux->dverm.c2); + return vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2, 0, ptr, end); + case ACCEL_DVERM_NOCASE: + DEBUG_PRINTF("double vermicelli-nocase for 0x%02hhx%02hhx\n", + aux->dverm.c1, aux->dverm.c2); + return vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2, 1, ptr, end); + case ACCEL_SHUFTI: + DEBUG_PRINTF("single shufti\n"); + return shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end); + case ACCEL_TRUFFLE: + DEBUG_PRINTF("truffle\n"); + return truffleExec(aux->truffle.mask1, aux->truffle.mask2, ptr, end); + default: + /* no acceleration, fall through and return current ptr */ + DEBUG_PRINTF("no accel; %u\n", (int)aux->accel_type); + assert(aux->accel_type == ACCEL_NONE); + return ptr; + } +} + +static really_inline +void do_accel_block(const union AccelAux *aux, const u8 *buf, size_t len, + size_t *start) { + + if (len - *start < MIN_ACCEL_LEN_BLOCK) { + return; + } + + const u8 *ptr = buf + *start; + const u8 *end = buf + len; + const u8 offset = aux->generic.offset; + ptr = run_hwlm_accel(aux, ptr, end); + + if (offset) { + ptr -= offset; + if (ptr < buf) { + ptr = buf; + } + } + assert(ptr >= buf); + *start = ptr - buf; +} + +static really_inline +int inaccurate_accel(u8 type) { + /* accels which don't always catch up to the boundary + * DSHUFTI is also inaccurate but it is not used by the hamsters */ + return type == ACCEL_DVERM_NOCASE || type == ACCEL_DVERM; +} + +static never_inline +void do_accel_streaming(const union AccelAux *aux, const u8 *hbuf, size_t hlen, + const u8 *buf, size_t len, size_t *start) { + if (aux->accel_type == 
ACCEL_NONE || len - *start < MIN_ACCEL_LEN_STREAM) { + return; + } + + const u8 offset = aux->generic.offset; + + DEBUG_PRINTF("using accel %hhu offset %hhu\n", aux->accel_type, offset); + + // Scan history buffer, but only if the start offset (which always refers to + // buf) is zero. + + if (!*start && hlen) { + const u8 *ptr1 = hbuf; + const u8 *end1 = hbuf + hlen; + if (hlen >= 16) { + ptr1 = run_hwlm_accel(aux, ptr1, end1); + } + + if ((hlen <= 16 || inaccurate_accel(aux->accel_type)) + && end1 != ptr1 && end1 - ptr1 <= 16) { + DEBUG_PRINTF("already scanned %zu/%zu\n", ptr1 - hbuf, hlen); + /* see if we can finish off the history buffer completely */ + u8 ALIGN_DIRECTIVE temp[17]; + ptrdiff_t tlen = end1 - ptr1; + memcpy(temp, ptr1, tlen); + memset(temp + tlen, 0, 17 - tlen); + if (len) { /* for dverm */ + temp[end1 - ptr1] = *buf; + } + + const u8 *tempp = run_hwlm_accel(aux, temp, temp + 17); + + if (tempp - temp >= tlen) { + ptr1 = end1; + } + DEBUG_PRINTF("got %zu\n", tempp - temp); + } + + if (ptr1 != end1) { + DEBUG_PRINTF("bailing in history\n"); + return; + } + } + + DEBUG_PRINTF("scanning main buffer, start=%zu, len=%zu\n", *start, len); + + const u8 *ptr2 = buf + *start; + const u8 *end2 = buf + len; + + const u8 *found = run_hwlm_accel(aux, ptr2, end2); + + if (found >= ptr2 + offset) { + size_t delta = found - offset - ptr2; + DEBUG_PRINTF("got %zu/%zu in 2nd buffer\n", delta, len); + *start += delta; + } else if (hlen) { + UNUSED size_t remaining = offset + ptr2 - found; + DEBUG_PRINTF("got %zu/%zu remaining in 1st buffer\n", remaining, hlen); + } +} + +hwlm_error_t hwlmExec(const struct HWLM *t, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, struct hs_scratch *scratch, + hwlm_group_t groups) { + assert(t); + + DEBUG_PRINTF("buf len=%zu, start=%zu, groups=%llx\n", len, start, groups); + if (!groups) { + DEBUG_PRINTF("groups all off\n"); + return HWLM_SUCCESS; + } + + assert(start < len); + + if (t->type == HWLM_ENGINE_NOOD) { + DEBUG_PRINTF("calling noodExec\n"); + return noodExec(HWLM_C_DATA(t), buf, len, start, cb, scratch); + } + + assert(t->type == HWLM_ENGINE_FDR); + const union AccelAux *aa = &t->accel0; + if ((groups & ~t->accel1_groups) == 0) { + DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type); + aa = &t->accel1; + } + do_accel_block(aa, buf, len, &start); + DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups, start); + return fdrExec(HWLM_C_DATA(t), buf, len, start, cb, scratch, groups); +} + +hwlm_error_t hwlmExecStreaming(const struct HWLM *t, size_t len, size_t start, + HWLMCallback cb, struct hs_scratch *scratch, + hwlm_group_t groups) { + assert(t); + assert(scratch); + + const u8 *hbuf = scratch->core_info.hbuf; + const size_t hlen = scratch->core_info.hlen; + const u8 *buf = scratch->core_info.buf; + + DEBUG_PRINTF("hbuf len=%zu, buf len=%zu, start=%zu, groups=%llx\n", hlen, + len, start, groups); + + if (!groups) { + return HWLM_SUCCESS; + } + + assert(start < len); + + if (t->type == HWLM_ENGINE_NOOD) { + DEBUG_PRINTF("calling noodExec\n"); + // If we've been handed a start offset, we can use a block mode scan at + // that offset. 
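+        // Matches can then only start inside the main buffer, so the history
+        // buffer is not needed; the streaming variant below instead stitches
+        // the history and current buffers together before scanning.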
+ if (start) { + return noodExec(HWLM_C_DATA(t), buf, len, start, cb, scratch); + } else { + return noodExecStreaming(HWLM_C_DATA(t), hbuf, hlen, buf, len, cb, + scratch); + } + } + + assert(t->type == HWLM_ENGINE_FDR); + const union AccelAux *aa = &t->accel0; + if ((groups & ~t->accel1_groups) == 0) { + DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type); + aa = &t->accel1; + } + do_accel_streaming(aa, hbuf, hlen, buf, len, &start); + DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups, start); + return fdrExecStreaming(HWLM_C_DATA(t), hbuf, hlen, buf, len, start, cb, + scratch, groups); +} diff --git a/regex/hwlm/hwlm.h b/regex/hwlm/hwlm.h new file mode 100644 index 000000000..224ecf6bf --- /dev/null +++ b/regex/hwlm/hwlm.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Hamster Wheel Literal Matcher: runtime API. + */ + +#ifndef HWLM_H +#define HWLM_H + +#include "ue2common.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** \brief Error return type for exec functions. */ +typedef int hwlm_error_t; + +/** \brief Type representing a set of groups as a bitmap. */ +typedef u64a hwlm_group_t; + +/** \brief HWLM callback return type. */ +typedef hwlm_group_t hwlmcb_rv_t; + +/** \brief Value representing all possible literal groups. */ +#define HWLM_ALL_GROUPS ((hwlm_group_t)~0ULL) + +/** \brief Callback return value indicating that we should continue matching. */ +#define HWLM_CONTINUE_MATCHING HWLM_ALL_GROUPS + +/** \brief Callback return value indicating that we should halt matching. */ +#define HWLM_TERMINATE_MATCHING 0 + +/** \brief Matching finished without being terminated by the user. */ +#define HWLM_SUCCESS 0 + +/** \brief The user terminated matching by returning HWLM_TERMINATE_MATCHING + * from the match callback. */ +#define HWLM_TERMINATED 1 + +/** \brief An error occurred during matching. + * + * This should only be used if an unsupported engine was called (like one + * designed for a different architecture). 
*/ +#define HWLM_ERROR_UNKNOWN 2 + +/** \brief Max length of the literal passed to HWLM. */ +#define HWLM_LITERAL_MAX_LEN 8 + +struct hs_scratch; +struct HWLM; + +/** \brief The type for an HWLM callback. + * + * This callback receives an end-of-match offset, the ID of the match and + * the context pointer that was passed into \ref hwlmExec or + * \ref hwlmExecStreaming. + * + * A callback return of \ref HWLM_TERMINATE_MATCHING will stop matching. + * + * A callback return of \ref HWLM_CONTINUE_MATCHING continues matching. + * + * An arbitrary group mask may be given as the return value. This will be taken + * as a hint by the underlying engine that only literals with groups + * overlapping the provided mask need to be reported. + * + * The underlying engine may choose not to report a match if there is no group + * belonging to the literal which was active at the when the end match location + * was first reached. + */ +typedef hwlmcb_rv_t (*HWLMCallback)(size_t end, u32 id, + struct hs_scratch *scratch); + +/** \brief Match strings in table. + * + * If a match occurs, the callback function given will be called with the index + * of the last character in the string and the \p context (passed through + * without interpretation). + * + * Returns \ref HWLM_TERMINATED if scanning is cancelled due to the callback + * returning \ref HWLM_TERMINATE_MATCHING. + * + * \p start is the first offset at which a match may start. Note: match + * starts may include masks overhanging the main literal. + * + * The underlying engine may choose not to report any match which starts before + * the first possible match of a literal which is in the initial group mask. + */ +hwlm_error_t hwlmExec(const struct HWLM *tab, const u8 *buf, size_t len, + size_t start, HWLMCallback callback, + struct hs_scratch *scratch, hwlm_group_t groups); + +/** \brief As for \ref hwlmExec, but a streaming case across two buffers. + * + * \p len is the length of the main buffer to be scanned. + * + * \p start is an advisory hint representing the first offset at which a match + * may start. Some underlying literal matches may not respect it. Note: match + * starts may include masks overhanging the main literal. + * + * \p scratch is used to access the history buffer, history length and + * the main buffer. + * + * Two buffers/lengths are provided. Matches that occur entirely within + * the history buffer will not be reported by this function. The offsets + * reported for the main buffer are relative to the start of that buffer (a + * match at byte 10 of the main buffer is reported as 10). Matches that start + * in the history buffer will have starts reported with 'negative' values. + */ +hwlm_error_t hwlmExecStreaming(const struct HWLM *tab, size_t len, size_t start, + HWLMCallback callback, + struct hs_scratch *scratch, hwlm_group_t groups); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/regex/hwlm/hwlm_internal.h b/regex/hwlm/hwlm_internal.h new file mode 100644 index 000000000..e35c84fdf --- /dev/null +++ b/regex/hwlm/hwlm_internal.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Hamster Wheel Literal Matcher: data structures. + */ + +#ifndef HWLM_INTERNAL_H +#define HWLM_INTERNAL_H + +#include "hwlm.h" +#include "ue2common.h" +#include "nfa/accel.h" + +/** \brief Underlying engine is FDR. */ +#define HWLM_ENGINE_FDR 12 + +/** \brief Underlying engine is Noodle. */ +#define HWLM_ENGINE_NOOD 16 + +/** \brief Main Hamster Wheel Literal Matcher header. Followed by + * engine-specific structure. */ +struct HWLM { + u8 type; /**< HWLM_ENGINE_NOOD or HWLM_ENGINE_FDR */ + hwlm_group_t accel1_groups; /**< accelerable groups. */ + union AccelAux accel1; /**< used if group mask is subset of accel1_groups */ + union AccelAux accel0; /**< fallback accel scheme */ +}; + +/** \brief Fetch a const pointer to the underlying engine. */ +#define HWLM_C_DATA(p) ((const void *)((const char *)(p) \ + + ROUNDUP_CL(sizeof(struct HWLM)))) + +/** \brief Fetch a pointer to the underlying engine. */ +#define HWLM_DATA(p) ((void *)((char *)(p) + ROUNDUP_CL(sizeof(struct HWLM)))) + +#endif diff --git a/regex/hwlm/noodle_engine.c b/regex/hwlm/noodle_engine.c new file mode 100644 index 000000000..0af2cb6f0 --- /dev/null +++ b/regex/hwlm/noodle_engine.c @@ -0,0 +1,447 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Noodle literal matcher: runtime. + */ +#include "hwlm.h" +#include "noodle_engine.h" +#include "noodle_internal.h" +#include "scratch.h" +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/compare.h" +#include "util/intrinsics.h" +#include "util/join.h" +#include "util/masked_move.h" +#include "util/partial_store.h" +#include "util/simd_utils.h" + +#ifndef __KERNEL__ +#include +#include +#include +#else +#include +#include +#endif + +/** \brief Noodle runtime context. */ +struct cb_info { + HWLMCallback cb; //!< callback function called on match + u32 id; //!< ID to pass to callback on match + struct hs_scratch *scratch; //!< scratch to pass to callback + size_t offsetAdj; //!< used in streaming mode +}; + +#if defined(HAVE_AVX512) +#define CHUNKSIZE 64 +#define MASK_TYPE m512 +#define Z_BITS 64 +#define Z_TYPE u64a +#elif defined(HAVE_AVX2) +#define CHUNKSIZE 32 +#define MASK_TYPE m256 +#define Z_BITS 32 +#define Z_TYPE u32 +#else +#define CHUNKSIZE 16 +#define MASK_TYPE m128 +#define Z_BITS 32 +#define Z_TYPE u32 +#endif + + +#define RETURN_IF_TERMINATED(x) \ + { \ + if ((x) == HWLM_TERMINATED) { \ + return HWLM_TERMINATED; \ + } \ + } + +#define SINGLE_ZSCAN() \ + do { \ + while (unlikely(z)) { \ + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \ + size_t matchPos = d - buf + pos; \ + DEBUG_PRINTF("match pos %zu\n", matchPos); \ + hwlmcb_rv_t rv = final(n, buf, len, 1, cbi, matchPos); \ + RETURN_IF_TERMINATED(rv); \ + } \ + } while (0) + +#define DOUBLE_ZSCAN() \ + do { \ + while (unlikely(z)) { \ + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \ + size_t matchPos = d - buf + pos - 1; \ + DEBUG_PRINTF("match pos %zu\n", matchPos); \ + hwlmcb_rv_t rv = final(n, buf, len, 0, cbi, matchPos); \ + RETURN_IF_TERMINATED(rv); \ + } \ + } while (0) + +static really_inline +u8 caseClear8(u8 x, bool noCase) { + return (u8)(noCase ? (x & (u8)0xdf) : x); +} + +// Make sure the rest of the string is there. The single character scanner +// is used only for single chars with case insensitivity used correctly, +// so it can go straight to the callback if we get this far. 
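+// For longer literals, final() reloads the msk_len bytes around the candidate
+// position and checks them against the precomputed msk/cmp pair before the
+// match callback is invoked.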
+static really_inline +hwlm_error_t final(const struct noodTable *n, const u8 *buf, UNUSED size_t len, + char single, const struct cb_info *cbi, size_t pos) { + if (single) { + if (n->msk_len == 1) { + goto match; + } + } + assert(len >= n->msk_len); + u64a v = + partial_load_u64a(buf + pos + n->key_offset - n->msk_len, n->msk_len); + DEBUG_PRINTF("v %016llx msk %016llx cmp %016llx\n", v, n->msk, n->cmp); + if ((v & n->msk) != n->cmp) { + /* mask didn't match */ + return HWLM_SUCCESS; + } + +match: + pos -= cbi->offsetAdj; + DEBUG_PRINTF("match @ %zu\n", pos + n->key_offset); + hwlmcb_rv_t rv = cbi->cb(pos + n->key_offset - 1, cbi->id, cbi->scratch); + if (rv == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATED; + } + return HWLM_SUCCESS; +} + +#if defined(HAVE_AVX512) +#define CHUNKSIZE 64 +#define MASK_TYPE m512 +#include "noodle_engine_avx512.c" +#elif defined(HAVE_AVX2) +#define CHUNKSIZE 32 +#define MASK_TYPE m256 +#include "noodle_engine_avx2.c" +#else +#define CHUNKSIZE 16 +#define MASK_TYPE m128 +#include "noodle_engine_sse.c" +#endif + +static really_inline +hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, bool noCase, + const struct cb_info *cbi) { + + const MASK_TYPE mask1 = getMask(n->key0, noCase); + const MASK_TYPE caseMask = getCaseMask(); + + size_t offset = start + n->msk_len - 1; + size_t end = len; + assert(offset < end); + +#if !defined(HAVE_AVX512) + hwlm_error_t rv; + + if (end - offset < CHUNKSIZE) { + rv = scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, offset, + end); + return rv; + } + + if (end - offset == CHUNKSIZE) { + rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + cbi, offset, end); + return rv; + } + + uintptr_t data = (uintptr_t)buf; + uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data; + uintptr_t last = data + end; + uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data; + uintptr_t s3Start = end - CHUNKSIZE; + + if (offset != s2Start) { + // first scan out to the fast scan starting point + DEBUG_PRINTF("stage 1: -> %zu\n", s2Start); + rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + cbi, offset, s2Start); + RETURN_IF_TERMINATED(rv); + } + + if (likely(s2Start != s2End)) { + // scan as far as we can, bounded by the last point this key can + // possibly match + DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s2End); + rv = scanSingleFast(n, buf, len, noCase, caseMask, mask1, cbi, s2Start, + s2End); + RETURN_IF_TERMINATED(rv); + } + + // if we are done bail out + if (s2End == len) { + return HWLM_SUCCESS; + } + + DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, len); + rv = scanSingleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1, cbi, + s2End, len); + + return rv; +#else // HAVE_AVX512 + return scanSingle512(n, buf, len, noCase, caseMask, mask1, cbi, offset, + end); +#endif +} + +static really_inline +hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, bool noCase, + const struct cb_info *cbi) { + // we stop scanning for the key-fragment when the rest of the key can't + // possibly fit in the remaining buffer + size_t end = len - n->key_offset + 2; + + // the first place the key can match + size_t offset = start + n->msk_len - n->key_offset; + + const MASK_TYPE caseMask = getCaseMask(); + const MASK_TYPE mask1 = getMask(n->key0, noCase); + const MASK_TYPE mask2 = getMask(n->key1, noCase); + +#if !defined(HAVE_AVX512) + hwlm_error_t rv; + + if (end - offset < CHUNKSIZE) { + rv = 
scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + offset, end); + return rv; + } + if (end - offset == CHUNKSIZE) { + rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + mask2, cbi, offset, end); + return rv; + } + + uintptr_t data = (uintptr_t)buf; + uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data; + uintptr_t s1End = s2Start + 1; + uintptr_t last = data + end; + uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data; + uintptr_t s3Start = end - CHUNKSIZE; + uintptr_t off = offset; + + if (s2Start != off) { + // first scan out to the fast scan starting point plus one char past to + // catch the key on the overlap + DEBUG_PRINTF("stage 1: %zu -> %zu\n", off, s2Start); + rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + mask2, cbi, off, s1End); + RETURN_IF_TERMINATED(rv); + } + off = s1End; + + if (s2Start >= end) { + DEBUG_PRINTF("s2 == mL %zu\n", end); + return HWLM_SUCCESS; + } + + if (likely(s2Start != s2End)) { + // scan as far as we can, bounded by the last point this key can + // possibly match + DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s3Start); + rv = scanDoubleFast(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + s2Start, s2End); + RETURN_IF_TERMINATED(rv); + off = s2End; + } + + // if there isn't enough data left to match the key, bail out + if (s2End == end) { + return HWLM_SUCCESS; + } + + DEBUG_PRINTF("stage 3: %zu -> %zu\n", s3Start, end); + rv = scanDoubleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1, + mask2, cbi, off, end); + + return rv; +#else // AVX512 + return scanDouble512(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + offset, end); +#endif // AVX512 +} + + +static really_inline +hwlm_error_t scanSingleNoCase(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, + const struct cb_info *cbi) { + return scanSingleMain(n, buf, len, start, 1, cbi); +} + +static really_inline +hwlm_error_t scanSingleCase(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, + const struct cb_info *cbi) { + return scanSingleMain(n, buf, len, start, 0, cbi); +} + +// Single-character specialisation, used when keyLen = 1 +static really_inline +hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, bool noCase, const struct cb_info *cbi) { + if (!ourisalpha(n->key0)) { + noCase = 0; // force noCase off if we don't have an alphabetic char + } + + // kinda ugly, but this forces constant propagation + if (noCase) { + return scanSingleNoCase(n, buf, len, start, cbi); + } else { + return scanSingleCase(n, buf, len, start, cbi); + } +} + + +static really_inline +hwlm_error_t scanDoubleNoCase(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, + const struct cb_info *cbi) { + return scanDoubleMain(n, buf, len, start, 1, cbi); +} + +static really_inline +hwlm_error_t scanDoubleCase(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, + const struct cb_info *cbi) { + return scanDoubleMain(n, buf, len, start, 0, cbi); +} + + +static really_inline +hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, bool noCase, const struct cb_info *cbi) { + // kinda ugly, but this forces constant propagation + if (noCase) { + return scanDoubleNoCase(n, buf, len, start, cbi); + } else { + return scanDoubleCase(n, buf, len, start, cbi); + } +} + +// main entry point for the scan code +static really_inline +hwlm_error_t scan(const struct noodTable *n, const u8 *buf, size_t len, + 
size_t start, char single, bool noCase, + const struct cb_info *cbi) { + if (len - start < n->msk_len) { + // can't find string of length keyLen in a shorter buffer + return HWLM_SUCCESS; + } + + if (single) { + return scanSingle(n, buf, len, start, noCase, cbi); + } else { + return scanDouble(n, buf, len, start, noCase, cbi); + } +} + +/** \brief Block-mode scanner. */ +hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, + struct hs_scratch *scratch) { + assert(n && buf); + + struct cb_info cbi = {cb, n->id, scratch, 0}; + DEBUG_PRINTF("nood scan of %zu bytes for %*s @ %p\n", len, n->msk_len, + (const char *)&n->cmp, buf); + + return scan(n, buf, len, start, n->single, n->nocase, &cbi); +} + +/** \brief Streaming-mode scanner. */ +hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf, + size_t hlen, const u8 *buf, size_t len, + HWLMCallback cb, struct hs_scratch *scratch) { + assert(n); + + if (len + hlen < n->msk_len) { + DEBUG_PRINTF("not enough bytes for a match\n"); + return HWLM_SUCCESS; + } + + struct cb_info cbi = {cb, n->id, scratch, 0}; + DEBUG_PRINTF("nood scan of %zu bytes (%zu hlen) for %*s @ %p\n", len, hlen, + n->msk_len, (const char *)&n->cmp, buf); + + if (hlen && n->msk_len > 1) { + /* + * we have history, so build up a buffer from enough of the history + * buffer plus what we've been given to scan. Since this is relatively + * short, just check against msk+cmp per byte offset for matches. + */ + assert(hbuf); + u8 ALIGN_DIRECTIVE temp_buf[HWLM_LITERAL_MAX_LEN * 2]; + memset(temp_buf, 0, sizeof(temp_buf)); + + assert(n->msk_len); + size_t tl1 = MIN((size_t)n->msk_len - 1, hlen); + size_t tl2 = MIN((size_t)n->msk_len - 1, len); + + assert(tl1 + tl2 <= sizeof(temp_buf)); + assert(tl1 + tl2 >= n->msk_len); + assert(tl1 <= sizeof(u64a)); + assert(tl2 <= sizeof(u64a)); + DEBUG_PRINTF("using %zu bytes of hist and %zu bytes of buf\n", tl1, tl2); + + unaligned_store_u64a(temp_buf, + partial_load_u64a(hbuf + hlen - tl1, tl1)); + unaligned_store_u64a(temp_buf + tl1, partial_load_u64a(buf, tl2)); + + for (size_t i = 0; i <= tl1 + tl2 - n->msk_len; i++) { + u64a v = unaligned_load_u64a(temp_buf + i); + if ((v & n->msk) == n->cmp) { + size_t m_end = -tl1 + i + n->msk_len - 1; + DEBUG_PRINTF("match @ %zu (i %zu)\n", m_end, i); + hwlmcb_rv_t rv = cb(m_end, n->id, scratch); + if (rv == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATED; + } + } + } + } + + assert(buf); + + cbi.offsetAdj = 0; + return scan(n, buf, len, 0, n->single, n->nocase, &cbi); +} diff --git a/regex/hwlm/noodle_engine.h b/regex/hwlm/noodle_engine.h new file mode 100644 index 000000000..64422c41f --- /dev/null +++ b/regex/hwlm/noodle_engine.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Noodle literal matcher: runtime API. + */ + +#ifndef NOODLE_ENGINE_H +#define NOODLE_ENGINE_H + +#include "hwlm.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +struct noodTable; +struct hs_scratch; + +/** \brief Block-mode scanner. */ +hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, + struct hs_scratch *scratch); + +/** \brief Streaming-mode scanner. */ +hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf, + size_t hlen, const u8 *buf, size_t len, + HWLMCallback cb, struct hs_scratch *scratch); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/regex/hwlm/noodle_engine_avx2.c b/regex/hwlm/noodle_engine_avx2.c new file mode 100644 index 000000000..2a42a3c0e --- /dev/null +++ b/regex/hwlm/noodle_engine_avx2.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* noodle scan parts for AVX */ + +static really_inline m256 getMask(u8 c, bool noCase) { + u8 k = caseClear8(c, noCase); + return set32x8(k); +} + +static really_inline m256 getCaseMask(void) { + return set32x8(0xdf); +} + +static really_inline +hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, bool noCase, + m256 caseMask, m256 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + offset; + DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); + const size_t l = end - start; + + m256 v = loadu256(d); + + if (noCase) { + v = and256(v, caseMask); + } + + u32 z = movemask256(eq256(mask1, v)); + + u32 buf_off = start - offset; + u32 mask = (u32)((u64a)(1ULL << l) - 1) << buf_off; + DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); + + z &= mask; + + SINGLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, bool noCase, + m256 caseMask, m256 mask1, m256 mask2, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + offset; + DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); + size_t l = end - start; + + m256 v = loadu256(d); + + if (noCase) { + v = and256(v, caseMask); + } + + u32 z0 = movemask256(eq256(mask1, v)); + u32 z1 = movemask256(eq256(mask2, v)); + u32 z = (z0 << 1) & z1; + + // mask out where we can't match + u32 buf_off = start - offset; + u32 mask = (u32)((u64a)(1ULL << l) - 1) << buf_off; + DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); + z &= mask; + + DOUBLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +// The short scan routine. It is used both to scan data up to an +// alignment boundary if needed and to finish off data that the aligned scan +// function can't handle (due to small/unaligned chunk at end) +static really_inline +hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m256 caseMask, m256 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + size_t l = end - start; + DEBUG_PRINTF("l %zu\n", l); + assert(l <= 32); + if (!l) { + return HWLM_SUCCESS; + } + + m256 v; + if (l < 4) { + u8 *vp = (u8*)&v; + switch (l) { + case 3: + vp[2] = d[2]; // fallthrough + fallthrough; + case 2: + vp[1] = d[1]; // fallthrough + fallthrough; + case 1: + vp[0] = d[0]; // fallthrough + } + } else { + v = masked_move256_len(d, l); + } + + if (noCase) { + v = and256(v, caseMask); + } + + // mask out where we can't match + u32 mask = (0xFFFFFFFF >> (32 - l)); + + u32 z = mask & movemask256(eq256(mask1, v)); + + SINGLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m256 caseMask, m256 mask1, + m256 mask2, const struct cb_info *cbi, + size_t start, size_t end) { + const u8 *d = buf + start; + size_t l = end - start; + if (!l) { + return HWLM_SUCCESS; + } + assert(l <= 32); + m256 v; + + DEBUG_PRINTF("d %zu\n", d - buf); + + if (l < 4) { + u8 *vp = (u8*)&v; + switch (l) { + case 3: + vp[2] = d[2]; // fallthrough + fallthrough; + case 2: + vp[1] = d[1]; // fallthrough + fallthrough; + case 1: + vp[0] = d[0]; // fallthrough + } + } else { + v = masked_move256_len(d, l); + } + + if (noCase) { + v = and256(v, caseMask); + } + + u32 z0 = movemask256(eq256(mask1, v)); + u32 z1 = movemask256(eq256(mask2, v)); + u32 z = (z0 << 1) & z1; + + // 
mask out where we can't match + u32 mask = (0xFFFFFFFF >> (32 - l)); + z &= mask; + + DOUBLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m256 caseMask, m256 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start, *e = buf + end; + assert(d < e); + + for (; d < e; d += 32) { + m256 v = noCase ? and256(load256(d), caseMask) : load256(d); + + u32 z = movemask256(eq256(mask1, v)); + + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d + 128); + + SINGLE_ZSCAN(); + } + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m256 caseMask, m256 mask1, + m256 mask2, const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start, *e = buf + end; + DEBUG_PRINTF("start %zu end %zu \n", start, end); + assert(d < e); + u32 lastz0 = 0; + + for (; d < e; d += 32) { + m256 v = noCase ? and256(load256(d), caseMask) : load256(d); + + // we have to pull the masks out of the AVX registers because we can't + // byte shift between the lanes + u32 z0 = movemask256(eq256(mask1, v)); + u32 z1 = movemask256(eq256(mask2, v)); + u32 z = (lastz0 | (z0 << 1)) & z1; + lastz0 = z0 >> 31; + + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d + 128); + + DOUBLE_ZSCAN(); + + } + return HWLM_SUCCESS; +} + diff --git a/regex/hwlm/noodle_engine_avx512.c b/regex/hwlm/noodle_engine_avx512.c new file mode 100644 index 000000000..8cac1b15c --- /dev/null +++ b/regex/hwlm/noodle_engine_avx512.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* noodle scan parts for AVX512 */ + +static really_inline +m512 getMask(u8 c, bool noCase) { + u8 k = caseClear8(c, noCase); + return set64x8(k); +} + +static really_inline +m512 getCaseMask(void) { + return set64x8(CASE_CLEAR); +} + +// The short scan routine. It is used both to scan data up to an +// alignment boundary if needed and to finish off data that the aligned scan +// function can't handle (due to small/unaligned chunk at end) +static really_inline +hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m512 caseMask, m512 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + ptrdiff_t scan_len = end - start; + DEBUG_PRINTF("scan_len %zu\n", scan_len); + assert(scan_len <= 64); + if (!scan_len) { + return HWLM_SUCCESS; + } + + __mmask64 k = (~0ULL) >> (64 - scan_len); + DEBUG_PRINTF("load mask 0x%016llx\n", k); + + m512 v = loadu_maskz_m512(k, d); + + if (noCase) { + v = and512(v, caseMask); + } + + // reuse the load mask to indicate valid bytes + u64a z = masked_eq512mask(k, mask1, v); + + SINGLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanSingle512(const struct noodTable *n, const u8 *buf, size_t len, + bool noCase, m512 caseMask, m512 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + const u8 *e = buf + end; + DEBUG_PRINTF("start %p end %p \n", d, e); + assert(d < e); + if (d + 64 >= e) { + goto tail; + } + + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDUP_PTR(d, 64); + if (scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, start, + d1 - buf) == HWLM_TERMINATED) { + return HWLM_TERMINATED; + } + d = d1; + + for (; d + 64 < e; d += 64) { + DEBUG_PRINTF("d %p e %p \n", d, e); + m512 v = noCase ? 
and512(load512(d), caseMask) : load512(d); + + u64a z = eq512mask(mask1, v); + __builtin_prefetch(d + 128); + + SINGLE_ZSCAN(); + } + +tail: + DEBUG_PRINTF("d %p e %p \n", d, e); + // finish off tail + + return scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, d - buf, + e - buf); +} + +static really_inline +hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m512 caseMask, m512 mask1, + m512 mask2, const struct cb_info *cbi, + u64a *lastz0, size_t start, size_t end) { + DEBUG_PRINTF("start %zu end %zu last 0x%016llx\n", start, end, *lastz0); + const u8 *d = buf + start; + ptrdiff_t scan_len = end - start; + if (!scan_len) { + return HWLM_SUCCESS; + } + assert(scan_len <= 64); + __mmask64 k = (~0ULL) >> (64 - scan_len); + DEBUG_PRINTF("load mask 0x%016llx scan_len %zu\n", k, scan_len); + + m512 v = loadu_maskz_m512(k, d); + if (noCase) { + v = and512(v, caseMask); + } + + u64a z0 = masked_eq512mask(k, mask1, v); + u64a z1 = masked_eq512mask(k, mask2, v); + u64a z = (*lastz0 | (z0 << 1)) & z1; + DEBUG_PRINTF("z 0x%016llx\n", z); + + DOUBLE_ZSCAN(); + *lastz0 = z0 >> (scan_len - 1); + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDouble512(const struct noodTable *n, const u8 *buf, size_t len, + bool noCase, m512 caseMask, m512 mask1, m512 mask2, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + const u8 *e = buf + end; + u64a lastz0 = 0; + DEBUG_PRINTF("start %zu end %zu \n", start, end); + assert(d < e); + if (d + 64 >= e) { + goto tail; + } + + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDUP_PTR(d, 64); + if (scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + &lastz0, start, d1 - buf) == HWLM_TERMINATED) { + return HWLM_TERMINATED; + } + d = d1; + + for (; d + 64 < e; d += 64) { + DEBUG_PRINTF("d %p e %p 0x%016llx\n", d, e, lastz0); + m512 v = noCase ? and512(load512(d), caseMask) : load512(d); + + /* we have to pull the masks out of the AVX registers because we can't + byte shift between the lanes */ + u64a z0 = eq512mask(mask1, v); + u64a z1 = eq512mask(mask2, v); + u64a z = (lastz0 | (z0 << 1)) & z1; + lastz0 = z0 >> 63; + + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d + 256); + + DEBUG_PRINTF("z 0x%016llx\n", z); + + DOUBLE_ZSCAN(); + } + +tail: + DEBUG_PRINTF("d %p e %p off %zu \n", d, e, d - buf); + // finish off tail + + return scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + &lastz0, d - buf, end); +} diff --git a/regex/hwlm/noodle_engine_sse.c b/regex/hwlm/noodle_engine_sse.c new file mode 100644 index 000000000..7cd53d7ce --- /dev/null +++ b/regex/hwlm/noodle_engine_sse.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* noodle scan parts for SSE */ + +static really_inline m128 getMask(u8 c, bool noCase) { + u8 k = caseClear8(c, noCase); + return set16x8(k); +} + +static really_inline m128 getCaseMask(void) { + return set16x8(0xdf); +} + +static really_inline +hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m128 caseMask, m128 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + size_t l = end - start; + DEBUG_PRINTF("l %zu\n", l); + assert(l <= 16); + if (!l) { + return HWLM_SUCCESS; + } + m128 v = zeroes128(); + // we don't have a clever way of doing this move yet + memcpy(&v, d, l); + if (noCase) { + v = and128(v, caseMask); + } + + // mask out where we can't match + u32 mask = (0xFFFF >> (16 - l)); + + u32 z = mask & movemask128(eq128(mask1, v)); + + SINGLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, bool noCase, + m128 caseMask, m128 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + offset; + DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); + const size_t l = end - start; + + m128 v = loadu128(d); + + if (noCase) { + v = and128(v, caseMask); + } + + u32 buf_off = start - offset; + u32 mask = ((1 << l) - 1) << buf_off; + + u32 z = mask & movemask128(eq128(mask1, v)); + + DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); + + z &= mask; + + SINGLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m128 caseMask, m128 mask1, + m128 mask2, const struct cb_info *cbi, + size_t start, size_t end) { + const u8 *d = buf + start; + size_t l = end - start; + if (!l) { + return HWLM_SUCCESS; + } + assert(l <= 32); + + DEBUG_PRINTF("d %zu\n", d - buf); + m128 v = zeroes128(); + memcpy(&v, d, l); + if (noCase) { + v = and128(v, caseMask); + } + + u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), + eq128(mask2, v))); + + // mask out where we can't match + u32 mask = (0xFFFF >> (16 - l)); + z &= mask; + + DOUBLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, bool noCase, + m128 caseMask, m128 mask1, m128 mask2, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + offset; + DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); + size_t l = end - start; + + m128 v = loadu128(d); + + if (noCase) { + v = and128(v, caseMask); + } + + u32 z = 
movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), + eq128(mask2, v))); + + // mask out where we can't match + u32 buf_off = start - offset; + u32 mask = ((1 << l) - 1) << buf_off; + DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); + z &= mask; + + DOUBLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m128 caseMask, m128 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start, *e = buf + end; + assert(d < e); + + for (; d < e; d += 16) { + m128 v = noCase ? and128(load128(d), caseMask) : load128(d); + + u32 z = movemask128(eq128(mask1, v)); + + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d + 128); + + SINGLE_ZSCAN(); + } + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m128 caseMask, m128 mask1, + m128 mask2, const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start, *e = buf + end; + assert(d < e); + m128 lastz1 = zeroes128(); + + for (; d < e; d += 16) { + m128 v = noCase ? and128(load128(d), caseMask) : load128(d); + m128 z1 = eq128(mask1, v); + m128 z2 = eq128(mask2, v); + u32 z = movemask128(and128(palignr(z1, lastz1, 15), z2)); + lastz1 = z1; + + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d + 128); + DEBUG_PRINTF("z 0x%08x\n", z); + DOUBLE_ZSCAN(); + } + return HWLM_SUCCESS; +} diff --git a/regex/hwlm/noodle_internal.h b/regex/hwlm/noodle_internal.h new file mode 100644 index 000000000..8f76f177e --- /dev/null +++ b/regex/hwlm/noodle_internal.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Data structures for Noodle literal matcher engine. 
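+ *
+ * A noodTable describes a one- or two-byte search key (key0/key1) plus a
+ * msk/cmp pair that the runtime uses to verify the bytes around a candidate
+ * match before reporting it.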
+ */ + +#ifndef NOODLE_INTERNAL_H +#define NOODLE_INTERNAL_H + +#include "ue2common.h" + +struct noodTable { + u32 id; + u64a msk; + u64a cmp; + u8 msk_len; + u8 key_offset; + u8 nocase; + u8 single; + u8 key0; + u8 key1; +}; + +#endif /* NOODLE_INTERNAL_H */ + diff --git a/regex/kmod/.clang-format b/regex/kmod/.clang-format new file mode 100644 index 000000000..1247d54f9 --- /dev/null +++ b/regex/kmod/.clang-format @@ -0,0 +1,683 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# clang-format configuration file. Intended for clang-format >= 11. +# +# For more information, see: +# +# Documentation/process/clang-format.rst +# https://clang.llvm.org/docs/ClangFormat.html +# https://clang.llvm.org/docs/ClangFormatStyleOptions.html +# +--- +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: false +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeInheritanceComma: false +BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeComma +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 8 +ContinuationIndentWidth: 8 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: false + +# Taken from: +# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \ +# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \ +# | LC_ALL=C sort -u +ForEachMacros: + - '__ata_qc_for_each' + - '__bio_for_each_bvec' + - '__bio_for_each_segment' + - '__evlist__for_each_entry' + - '__evlist__for_each_entry_continue' + - '__evlist__for_each_entry_from' + - '__evlist__for_each_entry_reverse' + - '__evlist__for_each_entry_safe' + - '__for_each_mem_range' + - '__for_each_mem_range_rev' + - '__for_each_thread' + - '__hlist_for_each_rcu' + - '__map__for_each_symbol_by_name' + - '__perf_evlist__for_each_entry' + - '__perf_evlist__for_each_entry_reverse' + - '__perf_evlist__for_each_entry_safe' + - '__rq_for_each_bio' + - '__shost_for_each_device' + - 'apei_estatus_for_each_section' + - 'ata_for_each_dev' + - 'ata_for_each_link' + - 'ata_qc_for_each' + - 'ata_qc_for_each_raw' + - 'ata_qc_for_each_with_internal' + - 'ax25_for_each' + - 'ax25_uid_for_each' + - 'bio_for_each_bvec' + - 'bio_for_each_bvec_all' + - 'bio_for_each_folio_all' + - 'bio_for_each_integrity_vec' + - 
'bio_for_each_segment' + - 'bio_for_each_segment_all' + - 'bio_list_for_each' + - 'bip_for_each_vec' + - 'bond_for_each_slave' + - 'bond_for_each_slave_rcu' + - 'bpf__perf_for_each_map' + - 'bpf__perf_for_each_map_named' + - 'bpf_for_each_spilled_reg' + - 'bpf_object__for_each_map' + - 'bpf_object__for_each_program' + - 'bpf_object__for_each_safe' + - 'bpf_perf_object__for_each' + - 'btree_for_each_safe128' + - 'btree_for_each_safe32' + - 'btree_for_each_safe64' + - 'btree_for_each_safel' + - 'card_for_each_dev' + - 'cgroup_taskset_for_each' + - 'cgroup_taskset_for_each_leader' + - 'cpufreq_for_each_efficient_entry_idx' + - 'cpufreq_for_each_entry' + - 'cpufreq_for_each_entry_idx' + - 'cpufreq_for_each_valid_entry' + - 'cpufreq_for_each_valid_entry_idx' + - 'css_for_each_child' + - 'css_for_each_descendant_post' + - 'css_for_each_descendant_pre' + - 'damon_for_each_region' + - 'damon_for_each_region_safe' + - 'damon_for_each_scheme' + - 'damon_for_each_scheme_safe' + - 'damon_for_each_target' + - 'damon_for_each_target_safe' + - 'data__for_each_file' + - 'data__for_each_file_new' + - 'data__for_each_file_start' + - 'device_for_each_child_node' + - 'displayid_iter_for_each' + - 'dma_fence_array_for_each' + - 'dma_fence_chain_for_each' + - 'dma_fence_unwrap_for_each' + - 'dma_resv_for_each_fence' + - 'dma_resv_for_each_fence_unlocked' + - 'do_for_each_ftrace_op' + - 'drm_atomic_crtc_for_each_plane' + - 'drm_atomic_crtc_state_for_each_plane' + - 'drm_atomic_crtc_state_for_each_plane_state' + - 'drm_atomic_for_each_plane_damage' + - 'drm_client_for_each_connector_iter' + - 'drm_client_for_each_modeset' + - 'drm_connector_for_each_possible_encoder' + - 'drm_for_each_bridge_in_chain' + - 'drm_for_each_connector_iter' + - 'drm_for_each_crtc' + - 'drm_for_each_crtc_reverse' + - 'drm_for_each_encoder' + - 'drm_for_each_encoder_mask' + - 'drm_for_each_fb' + - 'drm_for_each_legacy_plane' + - 'drm_for_each_plane' + - 'drm_for_each_plane_mask' + - 'drm_for_each_privobj' + - 'drm_mm_for_each_hole' + - 'drm_mm_for_each_node' + - 'drm_mm_for_each_node_in_range' + - 'drm_mm_for_each_node_safe' + - 'dsa_switch_for_each_available_port' + - 'dsa_switch_for_each_cpu_port' + - 'dsa_switch_for_each_port' + - 'dsa_switch_for_each_port_continue_reverse' + - 'dsa_switch_for_each_port_safe' + - 'dsa_switch_for_each_user_port' + - 'dsa_tree_for_each_user_port' + - 'dso__for_each_symbol' + - 'dsos__for_each_with_build_id' + - 'elf_hash_for_each_possible' + - 'elf_section__for_each_rel' + - 'elf_section__for_each_rela' + - 'elf_symtab__for_each_symbol' + - 'evlist__for_each_cpu' + - 'evlist__for_each_entry' + - 'evlist__for_each_entry_continue' + - 'evlist__for_each_entry_from' + - 'evlist__for_each_entry_reverse' + - 'evlist__for_each_entry_safe' + - 'flow_action_for_each' + - 'for_each_acpi_dev_match' + - 'for_each_active_dev_scope' + - 'for_each_active_drhd_unit' + - 'for_each_active_iommu' + - 'for_each_aggr_pgid' + - 'for_each_available_child_of_node' + - 'for_each_bench' + - 'for_each_bio' + - 'for_each_board_func_rsrc' + - 'for_each_btf_ext_rec' + - 'for_each_btf_ext_sec' + - 'for_each_bvec' + - 'for_each_card_auxs' + - 'for_each_card_auxs_safe' + - 'for_each_card_components' + - 'for_each_card_dapms' + - 'for_each_card_pre_auxs' + - 'for_each_card_prelinks' + - 'for_each_card_rtds' + - 'for_each_card_rtds_safe' + - 'for_each_card_widgets' + - 'for_each_card_widgets_safe' + - 'for_each_cgroup_storage_type' + - 'for_each_child_of_node' + - 'for_each_clear_bit' + - 'for_each_clear_bit_from' + - 
'for_each_clear_bitrange' + - 'for_each_clear_bitrange_from' + - 'for_each_cmd' + - 'for_each_cmsghdr' + - 'for_each_collection' + - 'for_each_comp_order' + - 'for_each_compatible_node' + - 'for_each_component_dais' + - 'for_each_component_dais_safe' + - 'for_each_console' + - 'for_each_cpu' + - 'for_each_cpu_and' + - 'for_each_cpu_not' + - 'for_each_cpu_wrap' + - 'for_each_dapm_widgets' + - 'for_each_dedup_cand' + - 'for_each_dev_addr' + - 'for_each_dev_scope' + - 'for_each_dma_cap_mask' + - 'for_each_dpcm_be' + - 'for_each_dpcm_be_rollback' + - 'for_each_dpcm_be_safe' + - 'for_each_dpcm_fe' + - 'for_each_drhd_unit' + - 'for_each_dss_dev' + - 'for_each_efi_memory_desc' + - 'for_each_efi_memory_desc_in_map' + - 'for_each_element' + - 'for_each_element_extid' + - 'for_each_element_id' + - 'for_each_endpoint_of_node' + - 'for_each_event' + - 'for_each_event_tps' + - 'for_each_evictable_lru' + - 'for_each_fib6_node_rt_rcu' + - 'for_each_fib6_walker_rt' + - 'for_each_free_mem_pfn_range_in_zone' + - 'for_each_free_mem_pfn_range_in_zone_from' + - 'for_each_free_mem_range' + - 'for_each_free_mem_range_reverse' + - 'for_each_func_rsrc' + - 'for_each_group_evsel' + - 'for_each_group_member' + - 'for_each_hstate' + - 'for_each_if' + - 'for_each_inject_fn' + - 'for_each_insn' + - 'for_each_insn_prefix' + - 'for_each_intid' + - 'for_each_iommu' + - 'for_each_ip_tunnel_rcu' + - 'for_each_irq_nr' + - 'for_each_lang' + - 'for_each_link_codecs' + - 'for_each_link_cpus' + - 'for_each_link_platforms' + - 'for_each_lru' + - 'for_each_matching_node' + - 'for_each_matching_node_and_match' + - 'for_each_mem_pfn_range' + - 'for_each_mem_range' + - 'for_each_mem_range_rev' + - 'for_each_mem_region' + - 'for_each_member' + - 'for_each_memory' + - 'for_each_migratetype_order' + - 'for_each_missing_reg' + - 'for_each_net' + - 'for_each_net_continue_reverse' + - 'for_each_net_rcu' + - 'for_each_netdev' + - 'for_each_netdev_continue' + - 'for_each_netdev_continue_rcu' + - 'for_each_netdev_continue_reverse' + - 'for_each_netdev_feature' + - 'for_each_netdev_in_bond_rcu' + - 'for_each_netdev_rcu' + - 'for_each_netdev_reverse' + - 'for_each_netdev_safe' + - 'for_each_new_connector_in_state' + - 'for_each_new_crtc_in_state' + - 'for_each_new_mst_mgr_in_state' + - 'for_each_new_plane_in_state' + - 'for_each_new_plane_in_state_reverse' + - 'for_each_new_private_obj_in_state' + - 'for_each_new_reg' + - 'for_each_node' + - 'for_each_node_by_name' + - 'for_each_node_by_type' + - 'for_each_node_mask' + - 'for_each_node_state' + - 'for_each_node_with_cpus' + - 'for_each_node_with_property' + - 'for_each_nonreserved_multicast_dest_pgid' + - 'for_each_of_allnodes' + - 'for_each_of_allnodes_from' + - 'for_each_of_cpu_node' + - 'for_each_of_pci_range' + - 'for_each_old_connector_in_state' + - 'for_each_old_crtc_in_state' + - 'for_each_old_mst_mgr_in_state' + - 'for_each_old_plane_in_state' + - 'for_each_old_private_obj_in_state' + - 'for_each_oldnew_connector_in_state' + - 'for_each_oldnew_crtc_in_state' + - 'for_each_oldnew_mst_mgr_in_state' + - 'for_each_oldnew_plane_in_state' + - 'for_each_oldnew_plane_in_state_reverse' + - 'for_each_oldnew_private_obj_in_state' + - 'for_each_online_cpu' + - 'for_each_online_node' + - 'for_each_online_pgdat' + - 'for_each_path' + - 'for_each_pci_bridge' + - 'for_each_pci_dev' + - 'for_each_pcm_streams' + - 'for_each_physmem_range' + - 'for_each_populated_zone' + - 'for_each_possible_cpu' + - 'for_each_present_cpu' + - 'for_each_prime_number' + - 'for_each_prime_number_from' + - 
'for_each_probe_cache_entry' + - 'for_each_process' + - 'for_each_process_thread' + - 'for_each_prop_codec_conf' + - 'for_each_prop_dai_codec' + - 'for_each_prop_dai_cpu' + - 'for_each_prop_dlc_codecs' + - 'for_each_prop_dlc_cpus' + - 'for_each_prop_dlc_platforms' + - 'for_each_property_of_node' + - 'for_each_reg' + - 'for_each_reg_filtered' + - 'for_each_registered_fb' + - 'for_each_requested_gpio' + - 'for_each_requested_gpio_in_range' + - 'for_each_reserved_mem_range' + - 'for_each_reserved_mem_region' + - 'for_each_rtd_codec_dais' + - 'for_each_rtd_components' + - 'for_each_rtd_cpu_dais' + - 'for_each_rtd_dais' + - 'for_each_script' + - 'for_each_sec' + - 'for_each_set_bit' + - 'for_each_set_bit_from' + - 'for_each_set_bitrange' + - 'for_each_set_bitrange_from' + - 'for_each_set_clump8' + - 'for_each_sg' + - 'for_each_sg_dma_page' + - 'for_each_sg_page' + - 'for_each_sgtable_dma_page' + - 'for_each_sgtable_dma_sg' + - 'for_each_sgtable_page' + - 'for_each_sgtable_sg' + - 'for_each_shell_test' + - 'for_each_sibling_event' + - 'for_each_subelement' + - 'for_each_subelement_extid' + - 'for_each_subelement_id' + - 'for_each_sublist' + - 'for_each_subsystem' + - 'for_each_supported_activate_fn' + - 'for_each_supported_inject_fn' + - 'for_each_test' + - 'for_each_thread' + - 'for_each_token' + - 'for_each_unicast_dest_pgid' + - 'for_each_vsi' + - 'for_each_wakeup_source' + - 'for_each_zone' + - 'for_each_zone_zonelist' + - 'for_each_zone_zonelist_nodemask' + - 'func_for_each_insn' + - 'fwnode_for_each_available_child_node' + - 'fwnode_for_each_child_node' + - 'fwnode_graph_for_each_endpoint' + - 'gadget_for_each_ep' + - 'genradix_for_each' + - 'genradix_for_each_from' + - 'hash_for_each' + - 'hash_for_each_possible' + - 'hash_for_each_possible_rcu' + - 'hash_for_each_possible_rcu_notrace' + - 'hash_for_each_possible_safe' + - 'hash_for_each_rcu' + - 'hash_for_each_safe' + - 'hashmap__for_each_entry' + - 'hashmap__for_each_entry_safe' + - 'hashmap__for_each_key_entry' + - 'hashmap__for_each_key_entry_safe' + - 'hctx_for_each_ctx' + - 'hists__for_each_format' + - 'hists__for_each_sort_list' + - 'hlist_bl_for_each_entry' + - 'hlist_bl_for_each_entry_rcu' + - 'hlist_bl_for_each_entry_safe' + - 'hlist_for_each' + - 'hlist_for_each_entry' + - 'hlist_for_each_entry_continue' + - 'hlist_for_each_entry_continue_rcu' + - 'hlist_for_each_entry_continue_rcu_bh' + - 'hlist_for_each_entry_from' + - 'hlist_for_each_entry_from_rcu' + - 'hlist_for_each_entry_rcu' + - 'hlist_for_each_entry_rcu_bh' + - 'hlist_for_each_entry_rcu_notrace' + - 'hlist_for_each_entry_safe' + - 'hlist_for_each_entry_srcu' + - 'hlist_for_each_safe' + - 'hlist_nulls_for_each_entry' + - 'hlist_nulls_for_each_entry_from' + - 'hlist_nulls_for_each_entry_rcu' + - 'hlist_nulls_for_each_entry_safe' + - 'i3c_bus_for_each_i2cdev' + - 'i3c_bus_for_each_i3cdev' + - 'idr_for_each_entry' + - 'idr_for_each_entry_continue' + - 'idr_for_each_entry_continue_ul' + - 'idr_for_each_entry_ul' + - 'in_dev_for_each_ifa_rcu' + - 'in_dev_for_each_ifa_rtnl' + - 'inet_bind_bucket_for_each' + - 'inet_lhash2_for_each_icsk' + - 'inet_lhash2_for_each_icsk_continue' + - 'inet_lhash2_for_each_icsk_rcu' + - 'intlist__for_each_entry' + - 'intlist__for_each_entry_safe' + - 'kcore_copy__for_each_phdr' + - 'key_for_each' + - 'key_for_each_safe' + - 'klp_for_each_func' + - 'klp_for_each_func_safe' + - 'klp_for_each_func_static' + - 'klp_for_each_object' + - 'klp_for_each_object_safe' + - 'klp_for_each_object_static' + - 'kunit_suite_for_each_test_case' + - 
'kvm_for_each_memslot' + - 'kvm_for_each_memslot_in_gfn_range' + - 'kvm_for_each_vcpu' + - 'libbpf_nla_for_each_attr' + - 'list_for_each' + - 'list_for_each_codec' + - 'list_for_each_codec_safe' + - 'list_for_each_continue' + - 'list_for_each_entry' + - 'list_for_each_entry_continue' + - 'list_for_each_entry_continue_rcu' + - 'list_for_each_entry_continue_reverse' + - 'list_for_each_entry_from' + - 'list_for_each_entry_from_rcu' + - 'list_for_each_entry_from_reverse' + - 'list_for_each_entry_lockless' + - 'list_for_each_entry_rcu' + - 'list_for_each_entry_reverse' + - 'list_for_each_entry_safe' + - 'list_for_each_entry_safe_continue' + - 'list_for_each_entry_safe_from' + - 'list_for_each_entry_safe_reverse' + - 'list_for_each_entry_srcu' + - 'list_for_each_from' + - 'list_for_each_prev' + - 'list_for_each_prev_safe' + - 'list_for_each_safe' + - 'llist_for_each' + - 'llist_for_each_entry' + - 'llist_for_each_entry_safe' + - 'llist_for_each_safe' + - 'map__for_each_symbol' + - 'map__for_each_symbol_by_name' + - 'map_for_each_event' + - 'map_for_each_metric' + - 'maps__for_each_entry' + - 'maps__for_each_entry_safe' + - 'mci_for_each_dimm' + - 'media_device_for_each_entity' + - 'media_device_for_each_intf' + - 'media_device_for_each_link' + - 'media_device_for_each_pad' + - 'msi_for_each_desc' + - 'nanddev_io_for_each_page' + - 'netdev_for_each_lower_dev' + - 'netdev_for_each_lower_private' + - 'netdev_for_each_lower_private_rcu' + - 'netdev_for_each_mc_addr' + - 'netdev_for_each_uc_addr' + - 'netdev_for_each_upper_dev_rcu' + - 'netdev_hw_addr_list_for_each' + - 'nft_rule_for_each_expr' + - 'nla_for_each_attr' + - 'nla_for_each_nested' + - 'nlmsg_for_each_attr' + - 'nlmsg_for_each_msg' + - 'nr_neigh_for_each' + - 'nr_neigh_for_each_safe' + - 'nr_node_for_each' + - 'nr_node_for_each_safe' + - 'of_for_each_phandle' + - 'of_property_for_each_string' + - 'of_property_for_each_u32' + - 'pci_bus_for_each_resource' + - 'pci_doe_for_each_off' + - 'pcl_for_each_chunk' + - 'pcl_for_each_segment' + - 'pcm_for_each_format' + - 'perf_config_items__for_each_entry' + - 'perf_config_sections__for_each_entry' + - 'perf_config_set__for_each_entry' + - 'perf_cpu_map__for_each_cpu' + - 'perf_evlist__for_each_entry' + - 'perf_evlist__for_each_entry_reverse' + - 'perf_evlist__for_each_entry_safe' + - 'perf_evlist__for_each_evsel' + - 'perf_evlist__for_each_mmap' + - 'perf_hpp_list__for_each_format' + - 'perf_hpp_list__for_each_format_safe' + - 'perf_hpp_list__for_each_sort_list' + - 'perf_hpp_list__for_each_sort_list_safe' + - 'perf_pmu__for_each_hybrid_pmu' + - 'ping_portaddr_for_each_entry' + - 'plist_for_each' + - 'plist_for_each_continue' + - 'plist_for_each_entry' + - 'plist_for_each_entry_continue' + - 'plist_for_each_entry_safe' + - 'plist_for_each_safe' + - 'pnp_for_each_card' + - 'pnp_for_each_dev' + - 'protocol_for_each_card' + - 'protocol_for_each_dev' + - 'queue_for_each_hw_ctx' + - 'radix_tree_for_each_slot' + - 'radix_tree_for_each_tagged' + - 'rb_for_each' + - 'rbtree_postorder_for_each_entry_safe' + - 'rdma_for_each_block' + - 'rdma_for_each_port' + - 'rdma_umem_for_each_dma_block' + - 'resort_rb__for_each_entry' + - 'resource_list_for_each_entry' + - 'resource_list_for_each_entry_safe' + - 'rhl_for_each_entry_rcu' + - 'rhl_for_each_rcu' + - 'rht_for_each' + - 'rht_for_each_entry' + - 'rht_for_each_entry_from' + - 'rht_for_each_entry_rcu' + - 'rht_for_each_entry_rcu_from' + - 'rht_for_each_entry_safe' + - 'rht_for_each_from' + - 'rht_for_each_rcu' + - 'rht_for_each_rcu_from' + - 'rq_for_each_bvec' 
+ - 'rq_for_each_segment' + - 'rq_list_for_each' + - 'rq_list_for_each_safe' + - 'scsi_for_each_prot_sg' + - 'scsi_for_each_sg' + - 'sctp_for_each_hentry' + - 'sctp_skb_for_each' + - 'sec_for_each_insn' + - 'sec_for_each_insn_continue' + - 'sec_for_each_insn_from' + - 'shdma_for_each_chan' + - 'shost_for_each_device' + - 'sk_for_each' + - 'sk_for_each_bound' + - 'sk_for_each_entry_offset_rcu' + - 'sk_for_each_from' + - 'sk_for_each_rcu' + - 'sk_for_each_safe' + - 'sk_nulls_for_each' + - 'sk_nulls_for_each_from' + - 'sk_nulls_for_each_rcu' + - 'snd_array_for_each' + - 'snd_pcm_group_for_each_entry' + - 'snd_soc_dapm_widget_for_each_path' + - 'snd_soc_dapm_widget_for_each_path_safe' + - 'snd_soc_dapm_widget_for_each_sink_path' + - 'snd_soc_dapm_widget_for_each_source_path' + - 'strlist__for_each_entry' + - 'strlist__for_each_entry_safe' + - 'sym_for_each_insn' + - 'sym_for_each_insn_continue_reverse' + - 'symbols__for_each_entry' + - 'tb_property_for_each' + - 'tcf_act_for_each_action' + - 'tcf_exts_for_each_action' + - 'udp_portaddr_for_each_entry' + - 'udp_portaddr_for_each_entry_rcu' + - 'usb_hub_for_each_child' + - 'v4l2_device_for_each_subdev' + - 'v4l2_m2m_for_each_dst_buf' + - 'v4l2_m2m_for_each_dst_buf_safe' + - 'v4l2_m2m_for_each_src_buf' + - 'v4l2_m2m_for_each_src_buf_safe' + - 'virtio_device_for_each_vq' + - 'while_for_each_ftrace_op' + - 'xa_for_each' + - 'xa_for_each_marked' + - 'xa_for_each_range' + - 'xa_for_each_start' + - 'xas_for_each' + - 'xas_for_each_conflict' + - 'xas_for_each_marked' + - 'xbc_array_for_each_value' + - 'xbc_for_each_key_value' + - 'xbc_node_for_each_array_value' + - 'xbc_node_for_each_child' + - 'xbc_node_for_each_key_value' + - 'xbc_node_for_each_subkey' + - 'zorro_for_each_dev' + +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: false +IndentGotoLabels: false +IndentPPDirectives: None +IndentWidth: 8 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 8 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true + +# Taken from git's rules +PenaltyBreakAssignment: 10 +PenaltyBreakBeforeFirstCallParameter: 30 +PenaltyBreakComment: 10 +PenaltyBreakFirstLessLess: 0 +PenaltyBreakString: 10 +PenaltyExcessCharacter: 100 +PenaltyReturnTypeOnItsOwnLine: 60 + +PointerAlignment: Right +ReflowComments: false +SortIncludes: false +SortUsingDeclarations: false +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatementsExceptForEachMacros +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp03 +TabWidth: 8 +UseTab: Always +... 
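For orientation, a small C fragment formatted under the options above (8-column indentation, function braces on their own line via AfterFunction, for_each macros spaced like control keywords) would come out roughly as follows; the function and struct names are purely illustrative and not part of the sources being added:

static int count_items(struct list_head *head)
{
        struct rex_item *it;
        int n = 0;

        list_for_each_entry(it, head, list) {
                if (it->id > 0)
                        n++;
        }
        return n;
}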
diff --git a/regex/kmod/config.h b/regex/kmod/config.h new file mode 100644 index 000000000..22f0341f2 --- /dev/null +++ b/regex/kmod/config.h @@ -0,0 +1,109 @@ +/* used by cmake */ + +#ifndef CONFIG_H_ +#define CONFIG_H_ + +/* "Define if the build is 32 bit" */ +/* #undef ARCH_32_BIT */ + +/* "Define if the build is 64 bit" */ +#define ARCH_64_BIT + +/* "Define if building for IA32" */ +/* #undef ARCH_IA32 */ + +/* "Define if building for EM64T" */ +#define ARCH_X86_64 + +/* internal build, switch on dump support. */ +#define DUMP_SUPPORT + +/* Define if building "fat" runtime. */ +/* #undef FAT_RUNTIME */ + +/* Define if building AVX-512 in the fat runtime. */ +/* #undef BUILD_AVX512 */ + +/* Define if building AVX512VBMI in the fat runtime. */ +/* #undef BUILD_AVX512VBMI */ + +/* Define to 1 if `backtrace' works. */ +#define HAVE_BACKTRACE + +/* C compiler has __builtin_assume_aligned */ +#define HAVE_CC_BUILTIN_ASSUME_ALIGNED + +/* C++ compiler has __builtin_assume_aligned */ +#define HAVE_CXX_BUILTIN_ASSUME_ALIGNED + +/* C++ compiler has x86intrin.h */ +#define HAVE_CXX_X86INTRIN_H + +/* C compiler has x86intrin.h */ +#define HAVE_C_X86INTRIN_H + +/* C++ compiler has intrin.h */ +/* #undef HAVE_CXX_INTRIN_H */ + +/* C compiler has intrin.h */ +/* #undef HAVE_C_INTRIN_H */ + +/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to + 0 if you don't. */ +/* #undef HAVE_DECL_PTHREAD_SETAFFINITY_NP */ + +/* #undef HAVE_PTHREAD_NP_H */ + +/* Define to 1 if you have the `malloc_info' function. */ +/* #undef HAVE_MALLOC_INFO */ + +/* Define to 1 if you have the `memmem' function. */ +/* #undef HAVE_MEMMEM */ + +/* Define to 1 if you have a working `mmap' system call. */ +/* #undef HAVE_MMAP */ + +/* Define to 1 if `posix_memalign' works. */ +/* #undef HAVE_POSIX_MEMALIGN */ + +/* Define to 1 if you have the `setrlimit' function. */ +/* #undef HAVE_SETRLIMIT */ + +/* Define to 1 if you have the `shmget' function. */ +/* #undef HAVE_SHMGET */ + +/* Define to 1 if you have the `sigaction' function. */ +/* #undef HAVE_SIGACTION */ + +/* Define to 1 if you have the `sigaltstack' function. */ +/* #undef HAVE_SIGALTSTACK */ + +/* Define if the sqlite3_open_v2 call is available */ +/* #undef HAVE_SQLITE3_OPEN_V2 */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_UNISTD_H */ + +/* Define to 1 if you have the `_aligned_malloc' function. */ +/* #undef HAVE__ALIGNED_MALLOC */ + +/* Define if compiler has __builtin_constant_p */ +#define HAVE__BUILTIN_CONSTANT_P + +/* Optimize, inline critical functions */ +#define HS_OPTIMIZE + +#define HS_VERSION +#define HS_MAJOR_VERSION +#define HS_MINOR_VERSION +/* #undef HS_PATCH_VERSION */ + +#define BUILD_DATE + +/* define if this is a release build. */ +#define RELEASE_BUILD + +/* define if reverse_graph requires patch for boost 1.62.0 */ +/* #undef BOOST_REVGRAPH_PATCH */ + +#endif /* CONFIG_H_ */ diff --git a/regex/kmod/hs_version.h b/regex/kmod/hs_version.h new file mode 100644 index 000000000..f6fd235ba --- /dev/null +++ b/regex/kmod/hs_version.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HS_VERSION_H_C6428FAF8E3713 +#define HS_VERSION_H_C6428FAF8E3713 + +/** + * A version string to identify this release of Hyperscan. + */ +#define HS_VERSION_STRING "5.4.0 2022-03-31" + +#define HS_VERSION_32BIT ((5 << 24) | (4 << 16) | (0 << 8) | 0) + +#endif /* HS_VERSION_H_C6428FAF8E3713 */ diff --git a/regex/kmod/rex.c b/regex/kmod/rex.c new file mode 100644 index 000000000..69d8638a3 --- /dev/null +++ b/regex/kmod/rex.c @@ -0,0 +1,649 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* SPDX-FileCopyrightText: Copyright 2022 G-Core Labs S.A. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define CREATE_TRACE_POINTS +#include "rex_trace.h" +#include "rex.h" + +#include "hs_runtime.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include "fw/str.h" +//#include + +static ulong max_db_size = 4 << 20; +module_param(max_db_size, ulong, 0644); +MODULE_PARM_DESC(max_db_size, "Maximum size of configfs upload, default=4MB"); + +static DEFINE_IDR(rex_idr); +static DEFINE_MUTEX(rex_config_mutex); + +/** A wrapper around hs_database_t where we may store additional fields. */ +struct rex_database { + void __percpu *scratch; /* TODO: make it global */ + u8 bytes[] __aligned(8); +}; + +static inline hs_database_t *patterns(struct rex_database *db) +{ + if (!db) + return NULL; + return (hs_database_t *)db->bytes; +} + +/** + * Represent a configurable hyperscan database. + * @id: Handle used by BPF programs from rex_scan_bytes() kfunc (rw). + * @epoch: Sequential number which may be used to detect changes (ro). + * @note: An arbitrary user string (rw). + * @database: Compiled database binary (rw). + * + * Contains other derived read-only parameters: + * /info: Brief database description. 
+ * + */ +struct rex_policy { + u32 id; + u32 epoch; + struct mutex lock; + struct rex_database __rcu *database; + struct config_item item; + char note[PAGE_SIZE]; +}; + +struct rex_scan_ctx { + struct rex_scan_attr *attr; + const void *block; + size_t block_len; +}; + +static int rex_scan_cb(unsigned int expression, unsigned long long from, + unsigned long long to, unsigned int flags, void *raw_ctx) +{ + struct rex_scan_ctx *ctx = raw_ctx; + struct rex_scan_attr *attr = ctx->attr; + u32 features = attr->handler_flags; + + attr->last_event = (struct rex_event){ + .expression = expression, + .from = from, + .to = to, + .flags = flags, + }; + + trace_rex_match(attr); + attr->nr_events += 1; + + return (features & REX_SINGLE_SHOT) ? 1 : 0; +} + +int bpf_scan_bytes(const void *buf, __u32 buf__sz, struct rex_scan_attr *attr) +{ + struct rex_scan_ctx ctx = { + .attr = attr, + .block = buf, + .block_len = buf__sz, + }; + struct rex_policy *rex; + struct rex_database *db; + hs_scratch_t *scratch; + hs_error_t err; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + if (unlikely(!buf || !attr)) + return -EINVAL; + + rex = idr_find(&rex_idr, attr->database_id); + if (unlikely(!rex)) + return -EBADF; + + db = rcu_dereference(rex->database); + if (unlikely(!db)) + return -ENODATA; + + scratch = this_cpu_ptr(db->scratch); + + kernel_fpu_begin(); + err = hs_scan(patterns(db), buf, buf__sz, 0, scratch, rex_scan_cb, + &ctx); + kernel_fpu_end(); + + switch (err) { + case HS_DB_MODE_ERROR: + return -ENOEXEC; + case HS_SCAN_TERMINATED: + return 1; + case HS_SUCCESS: + return 0; + case HS_SCRATCH_IN_USE: + case HS_INVALID: + case HS_UNKNOWN_ERROR: + default: + WARN(1, "hs_scan() failed with code %d\n", (int)err); + return -EFAULT; + } +} +EXPORT_SYMBOL(bpf_scan_bytes); + +int bpf_scan_vector(const char *const *buf, + const unsigned int *length, + __u32 buf__sz, + struct rex_scan_attr *attr) +{ + struct rex_scan_ctx ctx = { + .attr = attr, + .block = buf, + .block_len = buf__sz, + }; + struct rex_policy *rex; + struct rex_database *db; + hs_scratch_t *scratch; + hs_error_t err; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + if (unlikely(!buf || !attr)) + return -EINVAL; + + rex = idr_find(&rex_idr, attr->database_id); + if (unlikely(!rex)) + return -EBADF; + + db = rcu_dereference(rex->database); + if (unlikely(!db)) + return -ENODATA; + + scratch = this_cpu_ptr(db->scratch); + + kernel_fpu_begin(); + err = hs_scan_vector(patterns(db), buf, length, buf__sz, 0, + scratch, rex_scan_cb, &ctx); + kernel_fpu_end(); + + switch (err) { + case HS_DB_MODE_ERROR: + return -ENOEXEC; + case HS_SCAN_TERMINATED: + return 1; + case HS_SUCCESS: + return 0; + case HS_SCRATCH_IN_USE: + case HS_INVALID: + case HS_UNKNOWN_ERROR: + default: + WARN(1, "hs_scan() failed with code %d\n", (int)err); + return -EFAULT; + } +} +EXPORT_SYMBOL(bpf_scan_vector); + +int bpf_scan_tfwstr(const TfwStr *str, + struct rex_scan_attr *attr) +{ + struct rex_scan_ctx ctx = { + .attr = attr, + .block = str, + .block_len = str->len, + }; + struct rex_policy *rex; + struct rex_database *db; + hs_scratch_t *scratch; + hs_error_t err; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + if (unlikely(!str || !attr)) + return -EINVAL; + + rex = idr_find(&rex_idr, attr->database_id); + if (unlikely(!rex)) + return -EBADF; + + db = rcu_dereference(rex->database); + if (unlikely(!db)) + return -ENODATA; + + scratch = this_cpu_ptr(db->scratch); + + kernel_fpu_begin(); + + + err = 
hs_scan_tfwstr(patterns(db), str, 0, + scratch, rex_scan_cb, &ctx); + + kernel_fpu_end(); + + switch (err) { + case HS_DB_MODE_ERROR: + return -ENOEXEC; + case HS_SCAN_TERMINATED: + return 1; + case HS_SUCCESS: + return 0; + case HS_SCRATCH_IN_USE: + case HS_INVALID: + case HS_UNKNOWN_ERROR: + default: + WARN(1, "hs_scan() failed with code %d\n", (int)err); + return -EFAULT; + } +} +EXPORT_SYMBOL(bpf_scan_tfwstr); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 18, 0) +/* Based on code taken from net/core/filter.c */ +/*static void *bpf_xdp_pointer(const struct xdp_buff *xdp, u32 offset, u32 len) +{ + u32 size = xdp->data_end - xdp->data; + void *addr = xdp->data; + + if (unlikely(offset > 0xffff || len > 0xffff)) + return ERR_PTR(-EFAULT); + + if (offset + len > size) + return ERR_PTR(-EINVAL); + + return addr + offset; +}*/ +#else +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) +/* This code is taken from net/core/filter.c */ +static void *bpf_xdp_pointer(const struct xdp_buff *xdp, u32 offset, u32 len) +{ + u32 size = xdp->data_end - xdp->data; + void *addr = xdp->data; + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i; + + if (unlikely(offset > 0xffff || len > 0xffff)) + return ERR_PTR(-EFAULT); + + if (offset + len > xdp_get_buff_len(xdp)) + return ERR_PTR(-EINVAL); + + if (offset < size) /* linear area */ + goto out; + + offset -= size; + for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ + u32 frag_size = skb_frag_size(&sinfo->frags[i]); + + if (offset < frag_size) { + addr = skb_frag_address(&sinfo->frags[i]); + size = frag_size; + break; + } + offset -= frag_size; + } +out: + return offset + len < size ? addr + offset : NULL; +} +#endif +#endif + +/*int bpf_xdp_scan_bytes(const struct xdp_md *xdp_md, u32 offset, u32 len, + struct rex_scan_attr *scan_attr) +{ + struct xdp_buff *xdp = (struct xdp_buff *)xdp_md; + void *ptr = bpf_xdp_pointer(xdp, offset, len); + + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + if (likely(ptr)) + return bpf_scan_bytes(ptr, len, scan_attr); + else + return -ENOTSUPP; +} +EXPORT_SYMBOL(bpf_xdp_scan_bytes); + +BTF_SET_START(rex_kfunc_ids) +BTF_ID(func, bpf_scan_bytes) +BTF_ID(func, bpf_xdp_scan_bytes) +BTF_SET_END(rex_kfunc_ids) +static DEFINE_KFUNC_BTF_ID_SET(&rex_kfunc_ids, rex_kfunc_btf_set);*/ + +static struct rex_policy *to_policy(struct config_item *item) +{ + return item ? container_of(item, struct rex_policy, item) : NULL; +} + +static ssize_t rexcfg_database_read(struct config_item *item, void *outbuf, + size_t size) +{ + struct rex_policy *rex = to_policy(item); + struct rex_database *db; + char *bytes = outbuf; + ssize_t ret; + + rcu_read_lock(); + db = rcu_dereference(rex->database); + + if (!bytes) { + /* In first call return size for te buffer. */ + if (hs_database_size(patterns(db), &ret)) + ret = 0; + } else if (size > 0) { + /* In second call fill the buffer with data. + * We have to check size again to avoid races. + */ + if (hs_database_size(patterns(db), &ret) || ret != size) { + ret = -ETXTBSY; + goto out; + } + + if (hs_serialize_database(patterns(db), &bytes, NULL)) { + WARN(1, "hs_serialize_database() failed\n"); + ret = -EIO; + } + + /* Check that pointer wasn't overwritten. 
*/ + BUG_ON(bytes != outbuf); + } else { + return 0; + } + +out: + rcu_read_unlock(); + return ret; +} + +static void rex_assign_database(struct rex_policy *rex, struct rex_database *db) +{ + db = rcu_replace_pointer(rex->database, db, + lockdep_is_held(&rex_config_mutex)); + rex->epoch += 1; + + if (db) { + synchronize_rcu(); + free_percpu(db->scratch); + kfree(db); + } +} + +static ssize_t rexcfg_database_write(struct config_item *item, + const void *bytes, size_t nbytes) +{ + struct rex_policy *rex = to_policy(item); + struct rex_database *db; + hs_scratch_t *proto = NULL; + size_t alloc_size; + int cpu; + + /* Drop existing database on empty write. */ + if (nbytes == 0) { + mutex_lock(&rex_config_mutex); + rex_assign_database(rex, NULL); + mutex_unlock(&rex_config_mutex); + return nbytes; + } + + if (hs_serialized_database_size(bytes, nbytes, &alloc_size)) + return -EIO; + + db = kmalloc(sizeof(*db) + alloc_size, GFP_KERNEL); + if (!db) + return -ENOMEM; + + if (hs_deserialize_database_at(bytes, nbytes, patterns(db))) { + kfree(db); + return -EINVAL; + } + + if (hs_alloc_scratch(patterns(db), &proto)) { + kfree(db); + return -ENOMEM; + } + + BUG_ON(hs_scratch_size(proto, &alloc_size)); + db->scratch = __alloc_percpu(alloc_size, 64); + if (!db->scratch) { + kfree(db); + hs_free_scratch(proto); + return -ENOMEM; + } + + for_each_possible_cpu(cpu) { + hs_scratch_t *dst = per_cpu_ptr(db->scratch, cpu); + + BUG_ON(hs_init_scratch(proto, dst)); + } + hs_free_scratch(proto); + + mutex_lock(&rex_config_mutex); + rex_assign_database(rex, db); + mutex_unlock(&rex_config_mutex); + + return nbytes; +} + +static ssize_t rexcfg_info_show(struct config_item *item, char *str) +{ + struct rex_policy *rex = to_policy(item); + struct rex_database *db; + char *info; + int ret = 0; + + rcu_read_lock(); + + db = rcu_dereference(rex->database); + if (hs_database_info(patterns(db), &info)) { + ret = -EIO; + goto out; + } + + ret += sysfs_emit_at(str, ret, "%s\n", info); + kfree(info); + +out: + rcu_read_unlock(); + return ret; +} + +static ssize_t rexcfg_epoch_show(struct config_item *item, char *str) +{ + return snprintf(str, PAGE_SIZE, "%d\n", to_policy(item)->epoch); +} + +static ssize_t rexcfg_id_show(struct config_item *item, char *str) +{ + return snprintf(str, PAGE_SIZE, "%d\n", to_policy(item)->id); +} + +static ssize_t rexcfg_id_store(struct config_item *item, const char *str, + size_t length) +{ + struct rex_policy *rex = to_policy(item); + int ret, new_id; + + ret = kstrtoint(str, 0, &new_id); + if (ret < 0) + return -EINVAL; + + mutex_lock(&rex_config_mutex); + + if (rex->id == new_id) { + ret = length; + goto out; + } + + ret = idr_alloc(&rex_idr, rex, new_id, new_id + 1, GFP_KERNEL); + if (ret < 0) + goto out; + + BUG_ON(idr_remove(&rex_idr, rex->id) != rex); + rex->id = new_id; + ret = length; + +out: + mutex_unlock(&rex_config_mutex); + return ret; +} + +static ssize_t rexcfg_note_show(struct config_item *item, char *str) +{ + struct rex_policy *rex = to_policy(item); + int ret; + + mutex_lock(&rex->lock); + ret = snprintf(str, PAGE_SIZE, "%s", to_policy(item)->note); + mutex_unlock(&rex->lock); + + return ret; +} + +static ssize_t rexcfg_note_store(struct config_item *item, const char *str, + size_t length) +{ + struct rex_policy *rex = to_policy(item); + + mutex_lock(&rex->lock); + strncpy(rex->note, str, length); + mutex_unlock(&rex->lock); + + return length; +} + +/* Our subsystem hierarchy is: + * + * /sys/kernel/config/rex/ + * | + * / + * | id (rw) + * | database (rw) + * | epoch (ro) + 
* | info (ro) + * | note (rw) + * | + * /... + */ + +CONFIGFS_BIN_ATTR(rexcfg_, database, NULL, 0); +CONFIGFS_ATTR_RO(rexcfg_, epoch); +CONFIGFS_ATTR_RO(rexcfg_, info); +CONFIGFS_ATTR(rexcfg_, id); +CONFIGFS_ATTR(rexcfg_, note); + +static void rexcfg_item_release(struct config_item *item) +{ + struct rex_policy *rex = to_policy(item); + + mutex_lock(&rex_config_mutex); + BUG_ON(idr_remove(&rex_idr, rex->id) != rex); + rex_assign_database(rex, NULL); + mutex_unlock(&rex_config_mutex); +} + +static const struct config_item_type rex_type = { + .ct_owner = THIS_MODULE, + .ct_attrs = (struct configfs_attribute *[]){ &rexcfg_attr_id, + &rexcfg_attr_info, + &rexcfg_attr_epoch, + &rexcfg_attr_note, NULL }, + .ct_bin_attrs = + (struct configfs_bin_attribute *[]){ + &rexcfg_attr_database, + NULL, + }, + .ct_item_ops = + &(struct configfs_item_operations){ + .release = rexcfg_item_release, + } +}; + +static struct config_item *rex_make_item(struct config_group *group, + const char *name) +{ + struct rex_policy *rex; + int id; + + rex = kzalloc(sizeof(*rex), GFP_KERNEL); + if (!rex) + return ERR_PTR(-ENOMEM); + + mutex_lock(&rex_config_mutex); + + /* Patch database attribute type */ + rexcfg_attr_database.cb_max_size = max_db_size; + config_item_init_type_name(&rex->item, name, &rex_type); + + id = idr_alloc(&rex_idr, rex, 0, U32_MAX, GFP_KERNEL); + if (id < 0) { + kfree(rex); + return ERR_PTR(id); + } + rex->id = id; + + mutex_unlock(&rex_config_mutex); + + return &rex->item; +} + +static const struct config_item_type rex_group_type = { + .ct_owner = THIS_MODULE, + .ct_group_ops = + &(struct configfs_group_operations){ + .make_item = rex_make_item, + }, +}; + +static struct configfs_subsystem rex_configfs = { + .su_mutex = __MUTEX_INITIALIZER(rex_configfs.su_mutex), + .su_group = + { + .cg_item = + { + .ci_namebuf = "rex", + .ci_type = &rex_group_type, + }, + }, +}; + +static void banner(void) +{ + pr_info("Hyperscan %s\n", hs_version()); +} + +static int __init rex_init(void) +{ + int err; + + config_group_init(&rex_configfs.su_group); + err = configfs_register_subsystem(&rex_configfs); + if (err) + return err; + + //register_btf_kfunc_id_set(&prog_test_kfunc_list, &rex_kfunc_btf_set); + + banner(); + return 0; +} + +static void __exit rex_exit(void) +{ + //unregister_kfunc_btf_id_set(&prog_test_kfunc_list, &rex_kfunc_btf_set); + configfs_unregister_subsystem(&rex_configfs); + WARN_ON(!idr_is_empty(&rex_idr)); + idr_destroy(&rex_idr); +} + +module_init(rex_init); +module_exit(rex_exit); + +/* Module information */ +MODULE_AUTHOR("Sergey Nizovtsev, sn@tempesta-tech.com"); +MODULE_DESCRIPTION("Hyperscan regex engine"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/regex/kmod/rex.h b/regex/kmod/rex.h new file mode 100644 index 000000000..2f08d1394 --- /dev/null +++ b/regex/kmod/rex.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* SPDX-FileCopyrightText: Copyright 2022 G-Core Labs S.A. */ + +#ifndef REX_ABI_USER_H +#define REX_ABI_USER_H + +#if !defined(__bpf__) +#include +#include +#define __ksym +#endif + +#include "fw/str.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Structure describing a match event. + */ +struct rex_event { + unsigned int expression; + unsigned long long from; + unsigned long long to; + unsigned long long flags; +}; + +/* handler_flags */ +enum { + REX_SINGLE_SHOT = 1 << 0, +}; + +/** + * Attributes for bpf_scan_bytes() and bpf_xdp_scan_bytes(). + * + * @database_id: Numeric database handle taken from configfs (in). 
+ * @handler_flags: Customize match handler behaviour (in). + * @event_count: Output number of events (inout). + * @last_event: Space to store match details. (out). + */ +struct rex_scan_attr { + __u32 database_id; + __u32 handler_flags; + __u32 nr_events; + struct rex_event last_event; +}; + +#if defined(__KERNEL__) || defined(__bpf__) + +/** + * Scan any buffer against regex pattern database. + * + * @buf: A pointer to a valid buffer. + * @buf__sz: Number of bytes to scan. + * @scan_attr: Input/output match attributes. + */ +int bpf_scan_bytes(const void *buf, __u32 buf__sz, + struct rex_scan_attr *scan_attr) __ksym; + +int bpf_scan_vector(const char *const *buf, const unsigned int *length, + __u32 buf__sz, struct rex_scan_attr *attr) __ksym; + +int bpf_scan_tfwstr(const TfwStr *str, struct rex_scan_attr *attr) __ksym; + +/** + * Scan @len packet bytes starting from @offset against pattern database. + * Similar to bpf_scan_bytes() but use XDP offsets to trick BPF verifier + * + * @xdp_md: A pointer to struct xdp_buff* actually. + * @scan_attr: Input/output match attributes. + */ +//int bpf_xdp_scan_bytes(const struct xdp_md *xdp_md, __u32 offset, __u32 len, +// struct rex_scan_attr *scan_attr) __ksym; + +#endif /* __KERNEL__ or __bpf__ */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // REX_ABI_USER_H diff --git a/regex/kmod/rex_trace.h b/regex/kmod/rex_trace.h new file mode 100644 index 000000000..c7d5e943d --- /dev/null +++ b/regex/kmod/rex_trace.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* SPDX-FileCopyrightText: Copyright 2022 G-Core Labs S.A. */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rex + +#if !defined(_TRACE_REX_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_REX_H + +#include +#include "rex.h" + +TRACE_EVENT(rex_match, TP_PROTO(struct rex_scan_attr *ctx), + + TP_ARGS(ctx), + + TP_STRUCT__entry(__field(__u32, database_id) __field(__u32, + event_index) + __field_struct(struct rex_event, event)), + + TP_fast_assign(__entry->database_id = ctx->database_id; + __entry->event_index = ctx->nr_events; + __entry->event = ctx->last_event;), + + TP_printk("regex=%u/%u at [%llu, %llu]", __entry->database_id, + __entry->event.expression, __entry->event.from, + __entry->event.to)); + +#endif /* _TRACE_REX_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE rex_trace + +/* This part must be outside protection */ +#include diff --git a/regex/kmod/ue2common_kern.h b/regex/kmod/ue2common_kern.h new file mode 100644 index 000000000..5b91b916a --- /dev/null +++ b/regex/kmod/ue2common_kern.h @@ -0,0 +1,106 @@ +#ifndef UE2COMMON_KERN_H +#define UE2COMMON_KERN_H + +#include "config.h" + +#ifndef pr_fmt +#define pr_fmt(fmt) "hyperscan:%s: " fmt, __func__ +#endif + +/* standard types used across ue2 */ + +/* We use the size_t type all over the place, usually defined in stddef.h. */ +#include +/* stdint.h for things like uintptr_t and friends */ +#include +#include +#include +#include +#include +#include +#include + +/* Linux kernel synonyms */ +#define FALLTHROUGH fallthrough +#define ALIGN_ATTR(x) __aligned(x) +#define ARRAY_LENGTH(a) ARRAY_SIZE(a) +#define UNUSED __always_unused +#define HS_PUBLIC_API /* nothing */ + +#define ALIGN_DIRECTIVE __aligned(16) +#define ALIGN_AVX_DIRECTIVE __aligned(32) +#define ALIGN_CL_DIRECTIVE __aligned(64) + +/* We append the 'a' for aligned, since these aren't common, garden variety + * 64 bit values. 
The alignment is necessary for structs on some platforms, + * so we don't end up performing accidental unaligned accesses. */ +typedef u64 __aligned(8) u64a; +typedef s64 __aligned(8) s64a; + +/* get the SIMD types */ +#include "util/simd_types.h" + +/** \brief Report identifier, used for internal IDs and external IDs (those + * reported on match). */ +typedef u32 ReportID; + +/** \brief Shorthand for the attribute to shut gcc about unused parameters */ + +/* really_inline forces inlining always */ +#if defined(HS_OPTIMIZE) +#define really_inline __always_inline __maybe_unused +#else +#define really_inline __maybe_unused +#endif + +/** no, seriously, inline it, even if building in debug mode */ +#define really_really_inline __always_inline __maybe_unused +#define never_inline noinline +#define alignof __alignof + +/* We use C99-style "restrict". */ +#define restrict __restrict + +/* Align to 16-byte boundary */ +#define ROUNDUP_16(a) (((a) + 0xf) & ~0xf) +#define ROUNDDOWN_16(a) ((a) & ~0xf) + +/* Align to N-byte boundary */ +#define ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1)) +#define ROUNDDOWN_N(a, n) ((a) & ~((n)-1)) + +/* Align to a cacheline - assumed to be 64 bytes */ +#define ROUNDUP_CL(a) ROUNDUP_N(a, 64) + +/* Align ptr to next N-byte boundary */ +#define ROUNDUP_PTR(ptr, n) (__typeof__(ptr))(ROUNDUP_N((uintptr_t)(ptr), (n))) +#define ROUNDDOWN_PTR(ptr, n) \ + (__typeof__(ptr))(ROUNDDOWN_N((uintptr_t)(ptr), (n))) + +#define ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0) +#define ISALIGNED_16(ptr) ISALIGNED_N((ptr), 16) +#define ISALIGNED_CL(ptr) ISALIGNED_N((ptr), 64) +#define ISALIGNED(ptr) ISALIGNED_N((ptr), alignof(__typeof__(*(ptr)))) +#define N_CHARS 256 + +/* Maximum offset representable in the 'unsigned long long' we use to return + offset values. */ +#define MAX_OFFSET 0xffffffffffffffffULL + +#if 0 +/* Produces lots of warnings about implicit integer casts */ +#define MIN min +#define MAX max +#else +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif + +#define LIMIT_TO_AT_MOST(a, b) (*(a) = MIN(*(a), (b))) +#define ENSURE_AT_LEAST(a, b) (*(a) = MAX(*(a), (b))) + +#define DEBUG_PRINTF(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) + +#define assert(cond) BUG_ON(!(cond)) + +#endif diff --git a/regex/nfa/accel.c b/regex/nfa/accel.c new file mode 100644 index 000000000..2bc60945f --- /dev/null +++ b/regex/nfa/accel.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accel.h" +#include "shufti.h" +#include "truffle.h" +#include "vermicelli.h" +#include "ue2common.h" + +const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) { + assert(ISALIGNED_N(accel, alignof(union AccelAux))); + const u8 *rv; + + switch (accel->accel_type) { + case ACCEL_NONE: + DEBUG_PRINTF("accel none %p %p\n", c, c_end); + return c; + + case ACCEL_VERM: + DEBUG_PRINTF("accel verm %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = vermicelliExec(accel->verm.c, 0, c, c_end); + break; + + case ACCEL_VERM_NOCASE: + DEBUG_PRINTF("accel verm nc %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = vermicelliExec(accel->verm.c, 1, c, c_end); + break; + + case ACCEL_DVERM: + DEBUG_PRINTF("accel dverm %p %p\n", c, c_end); + if (c + 16 + 1 >= c_end) { + return c; + } + + /* need to stop one early to get an accurate end state */ + rv = vermicelliDoubleExec(accel->dverm.c1, accel->dverm.c2, 0, c, + c_end - 1); + break; + + case ACCEL_DVERM_NOCASE: + DEBUG_PRINTF("accel dverm nc %p %p\n", c, c_end); + if (c + 16 + 1 >= c_end) { + return c; + } + + /* need to stop one early to get an accurate end state */ + rv = vermicelliDoubleExec(accel->dverm.c1, accel->dverm.c2, 1, c, + c_end - 1); + break; + + case ACCEL_DVERM_MASKED: + DEBUG_PRINTF("accel dverm masked %p %p\n", c, c_end); + if (c + 16 + 1 >= c_end) { + return c; + } + + /* need to stop one early to get an accurate end state */ + rv = vermicelliDoubleMaskedExec(accel->dverm.c1, accel->dverm.c2, + accel->dverm.m1, accel->dverm.m2, + c, c_end - 1); + break; + + case ACCEL_SHUFTI: + DEBUG_PRINTF("accel shufti %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = shuftiExec(accel->shufti.lo, accel->shufti.hi, c, c_end); + break; + + case ACCEL_TRUFFLE: + DEBUG_PRINTF("accel Truffle %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = truffleExec(accel->truffle.mask1, accel->truffle.mask2, c, c_end); + break; + + case ACCEL_DSHUFTI: + DEBUG_PRINTF("accel dshufti %p %p\n", c, c_end); + if (c + 15 + 1 >= c_end) { + return c; + } + + /* need to stop one early to get an accurate end state */ + rv = shuftiDoubleExec(accel->dshufti.lo1, + accel->dshufti.hi1, + accel->dshufti.lo2, + accel->dshufti.hi2, c, c_end - 1); + break; + + case ACCEL_RED_TAPE: + DEBUG_PRINTF("accel red tape %p %p\n", c, c_end); + rv = c_end; + break; + + + default: + assert(!"not here"); + return c; + } + + DEBUG_PRINTF("adjusting for offset %u\n", accel->generic.offset); + /* adjust offset to take into account the offset */ + rv = MAX(c + accel->generic.offset, rv); + rv -= accel->generic.offset; + + DEBUG_PRINTF("advanced %zd\n", rv - c); + + return rv; +} diff --git a/regex/nfa/accel.h b/regex/nfa/accel.h new file mode 100644 index 000000000..3a03d0596 --- /dev/null +++ b/regex/nfa/accel.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Acceleration: data structures and common definitions. + */ + +#ifndef ACCEL_H +#define ACCEL_H + +#include "ue2common.h" + +/* run time defs */ +#define BAD_ACCEL_DIST 4 +#define SMALL_ACCEL_PENALTY 8 +#define BIG_ACCEL_PENALTY 32 + +/// Minimum length of the scan buffer for us to attempt acceleration. +#define ACCEL_MIN_LEN 16 + +enum AccelType { + ACCEL_NONE, + ACCEL_VERM, + ACCEL_VERM_NOCASE, + ACCEL_DVERM, + ACCEL_DVERM_NOCASE, + ACCEL_RVERM, + ACCEL_RVERM_NOCASE, + ACCEL_RDVERM, + ACCEL_RDVERM_NOCASE, + ACCEL_REOD, + ACCEL_REOD_NOCASE, + ACCEL_RDEOD, + ACCEL_RDEOD_NOCASE, + ACCEL_SHUFTI, + ACCEL_DSHUFTI, + ACCEL_TRUFFLE, + ACCEL_RED_TAPE, + ACCEL_DVERM_MASKED, +}; + +/** \brief Structure for accel framework. */ +union AccelAux { + u8 accel_type; + struct { + u8 accel_type; + u8 offset; + } generic; + struct { + u8 accel_type; + u8 offset; + u8 c; // uppercase if nocase + } verm; + struct { + u8 accel_type; + u8 offset; + u8 c1; // uppercase if nocase + u8 c2; // uppercase if nocase + u8 m1; // masked variant + u8 m2; // masked variant + } dverm; + struct { + u8 accel_type; + u8 offset; + u8 c; // uppercase if nocase + u8 len; + } mverm; + struct { + u8 accel_type; + u8 offset; + u8 c; // uppercase if nocase + u8 len1; + u8 len2; + } mdverm; + struct { + u8 accel_type; + u8 offset; + m128 lo; + m128 hi; + } shufti; + struct { + u8 accel_type; + u8 offset; + m128 lo1; + m128 hi1; + m128 lo2; + m128 hi2; + } dshufti; + struct { + u8 accel_type; + u8 offset; + m128 mask1; + m128 mask2; + } truffle; +}; + +/** + * Runs the specified acceleration scheme between c and c_end, returns a point + * such that the acceleration scheme does not match before. 
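+ *
+ * Illustrative use only (variable names are hypothetical): callers typically
+ * advance their scan pointer with the returned value and resume matching
+ * from there, e.g.
+ *
+ *   cur = run_accel(&aux, cur, buf_end);
+ *   // resume the engine at 'cur'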
+ */ +const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end); + +#endif diff --git a/regex/nfa/callback.h b/regex/nfa/callback.h new file mode 100644 index 000000000..9bdaa8d14 --- /dev/null +++ b/regex/nfa/callback.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief NFA Callback definitions, used at runtime. + */ + +#ifndef NFA_CALLBACK_H +#define NFA_CALLBACK_H + +#include "ue2common.h" + +/** \brief The type for an NFA callback. + * + * This is a function that takes as arguments the current start and end offsets + * where the match occurs, the id of the match and the context pointer that was + * passed into the NFA API function that executed the NFA. + * + * The start offset is the "start of match" (SOM) offset for the match. It is + * only provided by engines that natively support SOM tracking (e.g. Gough). + * + * The end offset will be the offset after the character that caused the match. + * Thus, if we have a buffer containing 'abc', then a pattern that matches an + * empty string will have an offset of 0, a pattern that matches 'a' will have + * an offset of 1, and a pattern that matches 'abc' will have an offset of 3, + * which will be a value that is 'beyond' the size of the buffer. That is, if + * we have n characters in the buffer, there are n+1 different potential + * offsets for matches. + * + * This function should return an int - currently the possible return values + * are 0, which means 'stop running the engine' or non-zero, which means + * 'continue matching'. + */ +typedef int (*NfaCallback)(u64a start, u64a end, ReportID id, void *context); + +/** + * standard \ref NfaCallback return value indicating that engine execution + * should continue. (any non-zero value will serve this purpose) + */ +#define MO_CONTINUE_MATCHING 1 + +/** + * \ref NfaCallback return value indicating that engine execution should halt. 
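+ *
+ * For illustration, a minimal callback (the context struct is hypothetical)
+ * that records the first match and then stops the engine:
+ *
+ *   static int first_match_cb(u64a start, u64a end, ReportID id, void *ctx) {
+ *       struct first_match *m = ctx;
+ *       m->id = id;
+ *       m->end = end;
+ *       return MO_HALT_MATCHING;
+ *   }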
+ */ +#define MO_HALT_MATCHING 0 + +#endif // NFA_CALLBACK_H diff --git a/regex/nfa/castle.c b/regex/nfa/castle.c new file mode 100644 index 000000000..7c158b31c --- /dev/null +++ b/regex/nfa/castle.c @@ -0,0 +1,1149 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Castle: multi-tenant repeat engine, runtime code. 
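+ *
+ * A Castle is a collection of repeats that all share the same character
+ * reachability (see castle_internal.h in this patch for the data layout);
+ * this file implements the queue-driven execution of that engine.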
+ */ + +#include "castle.h" + +#include "castle_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "repeat.h" +#include "shufti.h" +#include "truffle.h" +#include "vermicelli.h" +#include "util/bitutils.h" +#include "util/multibit.h" +#include "util/partial_store.h" +#include "ue2common.h" + +static really_inline +const struct SubCastle *getSubCastle(const struct Castle *c, u32 num) { + assert(num < c->numRepeats); + const struct SubCastle *sub = + (const struct SubCastle *)((const char *)c + sizeof(struct Castle)); + assert(ISALIGNED(sub)); + return &sub[num]; +} + +static really_inline +const struct RepeatInfo *getRepeatInfo(const struct SubCastle *sub) { + const struct RepeatInfo *repeatInfo = + (const struct RepeatInfo *)((const char *)sub + sub->repeatInfoOffset); + return repeatInfo; +} + +static really_inline +union RepeatControl *getControl(char *full_state, const struct SubCastle *sub) { + union RepeatControl *rctrl = + (union RepeatControl *)(full_state + sub->fullStateOffset); + assert(ISALIGNED(rctrl)); + return rctrl; +} + +static really_inline +const union RepeatControl *getControlConst(const char *full_state, + const struct SubCastle *sub) { + const union RepeatControl *rctrl = + (const union RepeatControl *)(full_state + sub->fullStateOffset); + assert(ISALIGNED(rctrl)); + return rctrl; +} + +enum MatchMode { + CALLBACK_OUTPUT, + STOP_AT_MATCH, +}; + +static really_inline +char subCastleReportCurrent(const struct Castle *c, struct mq *q, + const u64a offset, const u32 subIdx) { + const struct SubCastle *sub = getSubCastle(c, subIdx); + const struct RepeatInfo *info = getRepeatInfo(sub); + + union RepeatControl *rctrl = getControl(q->state, sub); + char *rstate = (char *)q->streamState + sub->streamStateOffset + + info->packedCtrlSize; + enum RepeatMatch match = + repeatHasMatch(info, rctrl, rstate, offset); + DEBUG_PRINTF("repeatHasMatch returned %d\n", match); + if (match == REPEAT_MATCH) { + DEBUG_PRINTF("firing match at %llu for sub %u, report %u\n", offset, + subIdx, sub->report); + if (q->cb(0, offset, sub->report, q->context) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + return MO_CONTINUE_MATCHING; +} + +static really_inline +int castleReportCurrent(const struct Castle *c, struct mq *q) { + const u64a offset = q_cur_offset(q); + DEBUG_PRINTF("offset=%llu\n", offset); + + if (c->exclusive) { + u8 *active = (u8 *)q->streamState; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + DEBUG_PRINTF("subcastle %u\n", activeIdx); + if (subCastleReportCurrent(c, q, + offset, activeIdx) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + const u8 *active = (const u8 *)q->streamState + c->activeOffset; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { + DEBUG_PRINTF("subcastle %u\n", i); + if (subCastleReportCurrent(c, q, offset, i) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + + return MO_CONTINUE_MATCHING; +} + +static really_inline +char subCastleInAccept(const struct Castle *c, struct mq *q, + const ReportID report, const u64a offset, + const u32 subIdx) { + const struct SubCastle *sub = getSubCastle(c, subIdx); + + if (sub->report 
!= report) { + return 0; + } + const struct RepeatInfo *info = getRepeatInfo(sub); + + union RepeatControl *rctrl = getControl(q->state, sub); + char *rstate = (char *)q->streamState + sub->streamStateOffset + + info->packedCtrlSize; + enum RepeatMatch match = + repeatHasMatch(info, rctrl, rstate, offset); + if (match == REPEAT_MATCH) { + DEBUG_PRINTF("in an accept\n"); + return 1; + } + + return 0; +} + +static really_inline +char castleInAccept(const struct Castle *c, struct mq *q, + const ReportID report, const u64a offset) { + DEBUG_PRINTF("offset=%llu\n", offset); + /* ignore when just catching up due to full queue */ + if (report == MO_INVALID_IDX) { + return 0; + } + + if (c->exclusive) { + u8 *active = (u8 *)q->streamState; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + DEBUG_PRINTF("subcastle %u\n", activeIdx); + if (subCastleInAccept(c, q, report, offset, activeIdx)) { + return 1; + } + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + const u8 *active = (const u8 *)q->streamState + c->activeOffset; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { + DEBUG_PRINTF("subcastle %u\n", i); + if (subCastleInAccept(c, q, report, offset, i)) { + return 1; + } + } + } + + return 0; +} + +static really_inline +void subCastleDeactivateStaleSubs(const struct Castle *c, const u64a offset, + void *full_state, void *stream_state, + const u32 subIdx) { + const struct SubCastle *sub = getSubCastle(c, subIdx); + const struct RepeatInfo *info = getRepeatInfo(sub); + + union RepeatControl *rctrl = getControl(full_state, sub); + char *rstate = (char *)stream_state + sub->streamStateOffset + + info->packedCtrlSize; + + if (repeatHasMatch(info, rctrl, rstate, offset) == REPEAT_STALE) { + DEBUG_PRINTF("sub %u is stale at offset %llu\n", subIdx, offset); + if (sub->exclusiveId < c->numRepeats) { + u8 *active = (u8 *)stream_state; + u8 *groups = active + c->groupIterOffset; + mmbit_unset(groups, c->numGroups, sub->exclusiveId); + } else { + u8 *active = (u8 *)stream_state + c->activeOffset; + mmbit_unset(active, c->numRepeats, subIdx); + } + } +} + +static really_inline +void castleDeactivateStaleSubs(const struct Castle *c, const u64a offset, + void *full_state, void *stream_state) { + DEBUG_PRINTF("offset=%llu\n", offset); + + if (!c->staleIterOffset) { + DEBUG_PRINTF("{no repeats can go stale}\n"); + return; /* no subcastle can ever go stale */ + } + + if (c->exclusive) { + u8 *active = (u8 *)stream_state; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + DEBUG_PRINTF("subcastle %u\n", activeIdx); + subCastleDeactivateStaleSubs(c, offset, full_state, + stream_state, activeIdx); + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + const u8 *active = (const u8 *)stream_state + c->activeOffset; + const struct mmbit_sparse_iter *it + = (const void *)((const char *)c + c->staleIterOffset); + + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + u32 numRepeats = c->numRepeats; + u32 idx = 0; + + u32 i = mmbit_sparse_iter_begin(active, numRepeats, &idx, 
it, si_state); + while(i != MMB_INVALID) { + DEBUG_PRINTF("subcastle %u\n", i); + subCastleDeactivateStaleSubs(c, offset, full_state, stream_state, i); + i = mmbit_sparse_iter_next(active, numRepeats, i, &idx, it, + si_state); + } + } +} + +static really_inline +void castleProcessTop(const struct Castle *c, const u32 top, const u64a offset, + void *full_state, void *stream_state, + UNUSED char stale_checked) { + assert(top < c->numRepeats); + + const struct SubCastle *sub = getSubCastle(c, top); + const struct RepeatInfo *info = getRepeatInfo(sub); + union RepeatControl *rctrl = getControl(full_state, sub); + char *rstate = (char *)stream_state + sub->streamStateOffset + + info->packedCtrlSize; + + char is_alive = 0; + u8 *active = (u8 *)stream_state; + if (sub->exclusiveId < c->numRepeats) { + u8 *groups = active + c->groupIterOffset; + active += sub->exclusiveId * c->activeIdxSize; + if (mmbit_set(groups, c->numGroups, sub->exclusiveId)) { + const u32 activeIdx = partial_load_u32(active, c->activeIdxSize); + is_alive = (activeIdx == top); + } + + if (!is_alive) { + partial_store_u32(active, top, c->activeIdxSize); + } + } else { + active += c->activeOffset; + is_alive = mmbit_set(active, c->numRepeats, top); + } + + if (!is_alive) { + DEBUG_PRINTF("first top for inactive repeat %u\n", top); + } else { + DEBUG_PRINTF("repeat %u is already alive\n", top); + // Caller should ensure we're not stale. + assert(!stale_checked + || repeatHasMatch(info, rctrl, rstate, offset) != REPEAT_STALE); + + // Ignore duplicate top events. + u64a last = repeatLastTop(info, rctrl, rstate); + + assert(last <= offset); + if (last == offset) { + DEBUG_PRINTF("dupe top at %llu\n", offset); + return; + } + } + + repeatStore(info, rctrl, rstate, offset, is_alive); +} + +static really_inline +void subCastleFindMatch(const struct Castle *c, const u64a begin, + const u64a end, void *full_state, void *stream_state, + size_t *mloc, char *found, const u32 subIdx) { + const struct SubCastle *sub = getSubCastle(c, subIdx); + const struct RepeatInfo *info = getRepeatInfo(sub); + union RepeatControl *rctrl = getControl(full_state, sub); + char *rstate = (char *)stream_state + sub->streamStateOffset + + info->packedCtrlSize; + + u64a match = repeatNextMatch(info, rctrl, rstate, begin); + if (match == 0) { + DEBUG_PRINTF("no more matches for sub %u\n", subIdx); + if (sub->exclusiveId < c->numRepeats) { + u8 *groups = (u8 *)stream_state + c->groupIterOffset; + mmbit_unset(groups, c->numGroups, sub->exclusiveId); + } else { + u8 *active = (u8 *)stream_state + c->activeOffset; + mmbit_unset(active, c->numRepeats, subIdx); + } + return; + } else if (match > end) { + DEBUG_PRINTF("next match for sub %u at %llu is > horizon\n", subIdx, + match); + return; + } + DEBUG_PRINTF("sub %u earliest match at %llu\n", subIdx, match); + size_t diff = match - begin; + if (!(*found) || diff < *mloc) { + *mloc = diff; + DEBUG_PRINTF("mloc=%zu\n", *mloc); + } + *found = 1; +} + +static really_inline +char castleFindMatch(const struct Castle *c, const u64a begin, const u64a end, + void *full_state, void *stream_state, size_t *mloc) { + DEBUG_PRINTF("begin=%llu, end=%llu\n", begin, end); + assert(begin <= end); + + if (begin == end) { + DEBUG_PRINTF("no work to do\n"); + return 0; + } + + char found = 0; + *mloc = 0; + + if (c->exclusive) { + u8 *active = (u8 *)stream_state; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + 
u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + DEBUG_PRINTF("subcastle %u\n", activeIdx); + subCastleFindMatch(c, begin, end, full_state, stream_state, mloc, + &found, activeIdx); + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + u8 *active = (u8 *)stream_state + c->activeOffset; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; + i = mmbit_iterate(active, c->numRepeats, i)) { + DEBUG_PRINTF("subcastle %u\n", i); + subCastleFindMatch(c, begin, end, full_state, stream_state, mloc, + &found, i); + } + } + + return found; +} + +static really_inline +u64a subCastleNextMatch(const struct Castle *c, void *full_state, + void *stream_state, const u64a loc, + const u32 subIdx) { + DEBUG_PRINTF("subcastle %u\n", subIdx); + const struct SubCastle *sub = getSubCastle(c, subIdx); + const struct RepeatInfo *info = getRepeatInfo(sub); + const union RepeatControl *rctrl = + getControlConst(full_state, sub); + const char *rstate = (const char *)stream_state + + sub->streamStateOffset + + info->packedCtrlSize; + + return repeatNextMatch(info, rctrl, rstate, loc); +} + +static really_inline +void set_matching(const struct Castle *c, const u64a match, u8 *active, + u8 *matching, const u32 active_size, const u32 active_id, + const u32 matching_id, u64a *offset, const u64a end) { + if (match == 0) { + DEBUG_PRINTF("no more matches\n"); + mmbit_unset(active, active_size, active_id); + } else if (match > end) { + // If we had a local copy of the active mmbit, we could skip + // looking at this repeat again. But we don't, so we just move + // on. + } else if (match == *offset) { + mmbit_set(matching, c->numRepeats, matching_id); + } else if (match < *offset) { + // New minimum offset. + *offset = match; + mmbit_clear(matching, c->numRepeats); + mmbit_set(matching, c->numRepeats, matching_id); + } +} + +static really_inline +void subCastleMatchLoop(const struct Castle *c, void *full_state, + void *stream_state, const u64a end, + const u64a loc, u64a *offset) { + u8 *active = (u8 *)stream_state + c->activeOffset; + u8 *matching = full_state; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { + u64a match = subCastleNextMatch(c, full_state, stream_state, loc, i); + set_matching(c, match, active, matching, c->numRepeats, i, + i, offset, end); + } +} + +static really_inline +char subCastleFireMatch(const struct Castle *c, const void *full_state, + UNUSED const void *stream_state, NfaCallback cb, + void *ctx, const u64a offset) { + const u8 *matching = full_state; + + // Fire all matching sub-castles at this offset. 
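+    // 'matching' is the temporary multibit kept in full_state (scratch) and
+    // populated by the match loop before this function is called.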
+ for (u32 i = mmbit_iterate(matching, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; + i = mmbit_iterate(matching, c->numRepeats, i)) { + const struct SubCastle *sub = getSubCastle(c, i); + DEBUG_PRINTF("firing match at %llu for sub %u\n", offset, i); + if (cb(0, offset, sub->report, ctx) == MO_HALT_MATCHING) { + DEBUG_PRINTF("caller told us to halt\n"); + return MO_HALT_MATCHING; + } + } + + return MO_CONTINUE_MATCHING; +} + +static really_inline +char castleMatchLoop(const struct Castle *c, const u64a begin, const u64a end, + void *full_state, void *stream_state, NfaCallback cb, + void *ctx) { + DEBUG_PRINTF("begin=%llu, end=%llu\n", begin, end); + assert(begin <= end); + + u8 *matching = full_state; // temp multibit + + u64a loc = begin; + while (loc < end) { + + // Find minimum next offset for the next match(es) from amongst our + // active sub-castles, and store the indices of the sub-castles that + // match at that offset in the 'matching' mmbit, which is in the + // full_state (scratch). + + u64a offset = end; // min offset of next match + u32 activeIdx = 0; + mmbit_clear(matching, c->numRepeats); + if (c->exclusive) { + u8 *active = (u8 *)stream_state; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + u8 *cur = active + i * c->activeIdxSize; + activeIdx = partial_load_u32(cur, c->activeIdxSize); + u64a match = subCastleNextMatch(c, full_state, stream_state, + loc, activeIdx); + set_matching(c, match, groups, matching, c->numGroups, i, + activeIdx, &offset, end); + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + subCastleMatchLoop(c, full_state, stream_state, + end, loc, &offset); + } + DEBUG_PRINTF("offset=%llu\n", offset); + if (!mmbit_any(matching, c->numRepeats)) { + DEBUG_PRINTF("no more matches\n"); + break; + } + + if (subCastleFireMatch(c, full_state, stream_state, + cb, ctx, offset) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + loc = offset; + } + + return MO_CONTINUE_MATCHING; +} + +static really_inline +char castleScanVerm(const struct Castle *c, const u8 *buf, const size_t begin, + const size_t end, size_t *loc) { + const u8 *ptr = vermicelliExec(c->u.verm.c, 0, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleScanNVerm(const struct Castle *c, const u8 *buf, const size_t begin, + const size_t end, size_t *loc) { + const u8 *ptr = nvermicelliExec(c->u.verm.c, 0, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleScanShufti(const struct Castle *c, const u8 *buf, const size_t begin, + const size_t end, size_t *loc) { + const m128 mask_lo = c->u.shuf.mask_lo; + const m128 mask_hi = c->u.shuf.mask_hi; + const u8 *ptr = shuftiExec(mask_lo, mask_hi, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char 
castleScanTruffle(const struct Castle *c, const u8 *buf, const size_t begin, + const size_t end, size_t *loc) { + const u8 *ptr = truffleExec(c->u.truffle.mask1, c->u.truffle.mask2, + buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleScan(const struct Castle *c, const u8 *buf, const size_t begin, + const size_t end, size_t *loc) { + assert(begin <= end); + + if (begin == end) { + return 0; + } + + switch (c->type) { + case CASTLE_DOT: + // Nothing can stop a dot scan! + return 0; + case CASTLE_VERM: + return castleScanVerm(c, buf, begin, end, loc); + case CASTLE_NVERM: + return castleScanNVerm(c, buf, begin, end, loc); + case CASTLE_SHUFTI: + return castleScanShufti(c, buf, begin, end, loc); + case CASTLE_TRUFFLE: + return castleScanTruffle(c, buf, begin, end, loc); + default: + DEBUG_PRINTF("unknown scan type!\n"); + assert(0); + return 0; + } +} + +static really_inline +char castleRevScanVerm(const struct Castle *c, const u8 *buf, + const size_t begin, const size_t end, size_t *loc) { + const u8 *ptr = rvermicelliExec(c->u.verm.c, 0, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleRevScanNVerm(const struct Castle *c, const u8 *buf, + const size_t begin, const size_t end, size_t *loc) { + const u8 *ptr = rnvermicelliExec(c->u.verm.c, 0, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleRevScanShufti(const struct Castle *c, const u8 *buf, + const size_t begin, const size_t end, size_t *loc) { + const m128 mask_lo = c->u.shuf.mask_lo; + const m128 mask_hi = c->u.shuf.mask_hi; + const u8 *ptr = rshuftiExec(mask_lo, mask_hi, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleRevScanTruffle(const struct Castle *c, const u8 *buf, + const size_t begin, const size_t end, size_t *loc) { + const u8 *ptr = rtruffleExec(c->u.truffle.mask1, c->u.truffle.mask2, + buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleRevScan(const struct Castle *c, const u8 *buf, const size_t begin, + const size_t end, size_t *loc) { + assert(begin <= end); + DEBUG_PRINTF("scanning backwards over (%zu,%zu]\n", begin, end); + if (begin == end) { + return 0; + } + + switch (c->type) { + case CASTLE_DOT: + // Nothing can stop a dot scan! 
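+        // A dot castle matches every byte, so there is no escape character
+        // to search for in either direction.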
+ return 0; + case CASTLE_VERM: + return castleRevScanVerm(c, buf, begin, end, loc); + case CASTLE_NVERM: + return castleRevScanNVerm(c, buf, begin, end, loc); + case CASTLE_SHUFTI: + return castleRevScanShufti(c, buf, begin, end, loc); + case CASTLE_TRUFFLE: + return castleRevScanTruffle(c, buf, begin, end, loc); + default: + DEBUG_PRINTF("unknown scan type!\n"); + assert(0); + return 0; + } +} + +static really_inline +void castleHandleEvent(const struct Castle *c, struct mq *q, const u64a sp, + char stale_checked) { + const u32 event = q->items[q->cur].type; + switch (event) { + case MQE_TOP: + assert(0); // should be a numbered top + break; + case MQE_START: + case MQE_END: + break; + default: + assert(event >= MQE_TOP_FIRST); + assert(event < MQE_INVALID); + u32 top = event - MQE_TOP_FIRST; + DEBUG_PRINTF("top %u at offset %llu\n", top, sp); + castleProcessTop(c, top, sp, q->state, q->streamState, stale_checked); + break; + } +} + +static really_inline +void clear_repeats(const struct Castle *c, const struct mq *q, u8 *active) { + DEBUG_PRINTF("clearing active repeats due to escape\n"); + if (c->exclusive) { + u8 *groups = (u8 *)q->streamState + c->groupIterOffset; + mmbit_clear(groups, c->numGroups); + } + + if (c->exclusive != PURE_EXCLUSIVE) { + mmbit_clear(active, c->numRepeats); + } +} + +static really_inline +char nfaExecCastle_Q_i(const struct NFA *n, struct mq *q, s64a end, + enum MatchMode mode) { + assert(n && q); + assert(n->type == CASTLE_NFA); + + DEBUG_PRINTF("state=%p, streamState=%p\n", q->state, q->streamState); + + const struct Castle *c = getImplNfa(n); + + if (q->report_current) { + int rv = castleReportCurrent(c, q); + q->report_current = 0; + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + if (q->cur == q->end) { + return 1; + } + + u8 *active = (u8 *)q->streamState + c->activeOffset;// active multibit + + assert(q->cur + 1 < q->end); // require at least two items + assert(q_cur_type(q) == MQE_START); + u64a sp = q_cur_offset(q); + q->cur++; + DEBUG_PRINTF("sp=%llu, abs_end=%llu\n", sp, end + q->offset); + + while (q->cur < q->end) { + DEBUG_PRINTF("q item type=%d offset=%llu\n", q_cur_type(q), + q_cur_offset(q)); + + char found = 0; + if (c->exclusive) { + u8 *groups = (u8 *)q->streamState + c->groupIterOffset; + found = mmbit_any(groups, c->numGroups); + } + + if (!found && !mmbit_any(active, c->numRepeats)) { + DEBUG_PRINTF("no repeats active, skipping scan\n"); + goto scan_done; + } + + u64a ep = q_cur_offset(q); + ep = MIN(ep, q->offset + end); + if (sp < ep) { + size_t eloc = 0; + char escape_found = 0; + DEBUG_PRINTF("scanning from sp=%llu to ep=%llu\n", sp, ep); + assert(sp >= q->offset && ep >= q->offset); + if (castleScan(c, q->buffer, sp - q->offset, ep - q->offset, + &eloc)) { + escape_found = 1; + ep = q->offset + eloc; + DEBUG_PRINTF("escape found at %llu\n", ep); + assert(ep >= sp); + } + + assert(sp <= ep); + + if (mode == STOP_AT_MATCH) { + size_t mloc; + if (castleFindMatch(c, sp, ep, q->state, q->streamState, + &mloc)) { + DEBUG_PRINTF("storing match at %llu\n", sp + mloc); + q->cur--; + assert(q->cur < MAX_MQE_LEN); + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = (s64a)(sp - q->offset) + mloc; + return MO_MATCHES_PENDING; + } + } else { + assert(mode == CALLBACK_OUTPUT); + char rv = castleMatchLoop(c, sp, ep, q->state, q->streamState, + q->cb, q->context); + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + assert(rv == MO_CONTINUE_MATCHING); + } + + if (escape_found) { + clear_repeats(c, q, 
active); + } + } + + scan_done: + if (q_cur_loc(q) > end) { + q->cur--; + assert(q->cur < MAX_MQE_LEN); + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + return MO_ALIVE; + } + + sp = q_cur_offset(q); + castleHandleEvent(c, q, sp, 1); + q->cur++; + } + + if (c->exclusive) { + u8 *groups = (u8 *)q->streamState + c->groupIterOffset; + if (mmbit_any_precise(groups, c->numGroups)) { + return 1; + } + } + + return mmbit_any_precise(active, c->numRepeats); +} + +char nfaExecCastle_Q(const struct NFA *n, struct mq *q, s64a end) { + DEBUG_PRINTF("entry\n"); + return nfaExecCastle_Q_i(n, q, end, CALLBACK_OUTPUT); +} + +char nfaExecCastle_Q2(const struct NFA *n, struct mq *q, s64a end) { + DEBUG_PRINTF("entry\n"); + return nfaExecCastle_Q_i(n, q, end, STOP_AT_MATCH); +} + +static +s64a castleLastKillLoc(const struct Castle *c, struct mq *q) { + assert(q_cur_type(q) == MQE_START); + assert(q_last_type(q) == MQE_END); + s64a sp = q_cur_loc(q); + s64a ep = q_last_loc(q); + + DEBUG_PRINTF("finding final squash in (%lld, %lld]\n", sp, ep); + + size_t loc; + + if (ep > 0) { + if (castleRevScan(c, q->buffer, sp > 0 ? sp : 0, ep, &loc)) { + return (s64a)loc; + } + ep = 0; + } + + if (sp < 0) { + s64a hlen = q->hlength; + + if (castleRevScan(c, q->history, sp + hlen, ep + hlen, &loc)) { + return (s64a)loc - hlen; + } + ep = 0; + } + + return sp - 1; /* the repeats are never killed */ +} + +char nfaExecCastle_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(n && q); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry\n"); + + if (q->cur == q->end) { + return 1; + } + + assert(q->cur + 1 < q->end); /* require at least two items */ + assert(q_cur_type(q) == MQE_START); + + const struct Castle *c = getImplNfa(n); + u8 *active = (u8 *)q->streamState + c->activeOffset; + + u64a end_offset = q_last_loc(q) + q->offset; + s64a last_kill_loc = castleLastKillLoc(c, q); + DEBUG_PRINTF("all repeats killed at %lld (exec range %lld, %lld)\n", + last_kill_loc, q_cur_loc(q), q_last_loc(q)); + assert(last_kill_loc < q_last_loc(q)); + + if (last_kill_loc != q_cur_loc(q) - 1) { + clear_repeats(c, q, active); + } + + q->cur++; /* skip start event */ + + /* skip events prior to the repeats being squashed */ + while (q_cur_loc(q) <= last_kill_loc) { + DEBUG_PRINTF("skipping moot event at %lld\n", q_cur_loc(q)); + q->cur++; + assert(q->cur < q->end); + } + + while (q->cur < q->end) { + DEBUG_PRINTF("q item type=%d offset=%llu\n", q_cur_type(q), + q_cur_offset(q)); + u64a sp = q_cur_offset(q); + castleHandleEvent(c, q, sp, 0); + q->cur++; + } + + castleDeactivateStaleSubs(c, end_offset, q->state, q->streamState); + + char found = 0; + if (c->exclusive) { + u8 *groups = (u8 *)q->streamState + c->groupIterOffset; + found = mmbit_any_precise(groups, c->numGroups); + + } + + if (!found && !mmbit_any_precise(active, c->numRepeats)) { + DEBUG_PRINTF("castle is dead\n"); + return 0; + } + + if (castleInAccept(c, q, report, end_offset)) { + return MO_MATCHES_PENDING; + } + + return 1; +} + +char nfaExecCastle_reportCurrent(const struct NFA *n, struct mq *q) { + assert(n && q); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry\n"); + + const struct Castle *c = getImplNfa(n); + castleReportCurrent(c, q); + return 0; +} + +char nfaExecCastle_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry\n"); + + const struct Castle *c = getImplNfa(n); + return castleInAccept(c, q, report, q_cur_offset(q)); +} + +char 
nfaExecCastle_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry\n"); + + const struct Castle *c = getImplNfa(n); + const u64a offset = q_cur_offset(q); + DEBUG_PRINTF("offset=%llu\n", offset); + + if (c->exclusive) { + u8 *active = (u8 *)q->streamState; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + DEBUG_PRINTF("subcastle %u\n", activeIdx); + const struct SubCastle *sub = getSubCastle(c, activeIdx); + if (subCastleInAccept(c, q, sub->report, offset, activeIdx)) { + return 1; + } + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + const u8 *active = (const u8 *)q->streamState + c->activeOffset; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { + DEBUG_PRINTF("subcastle %u\n", i); + const struct SubCastle *sub = getSubCastle(c, i); + if (subCastleInAccept(c, q, sub->report, offset, i)) { + return 1; + } + } + } + + return 0; +} + + +char nfaExecCastle_queueInitState(UNUSED const struct NFA *n, struct mq *q) { + assert(n && q); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry\n"); + + const struct Castle *c = getImplNfa(n); + assert(q->streamState); + if (c->exclusive) { + u8 *groups = (u8 *)q->streamState + c->groupIterOffset; + mmbit_clear(groups, c->numGroups); + } + + if (c->exclusive != PURE_EXCLUSIVE) { + u8 *active = (u8 *)q->streamState + c->activeOffset; + mmbit_clear(active, c->numRepeats); + } + return 0; +} + +char nfaExecCastle_initCompressedState(const struct NFA *n, UNUSED u64a offset, + void *state, UNUSED u8 key) { + assert(n && state); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry\n"); + + const struct Castle *c = getImplNfa(n); + if (c->exclusive) { + u8 *groups = (u8 *)state + c->groupIterOffset; + mmbit_clear(groups, c->numGroups); + } + + if (c->exclusive != PURE_EXCLUSIVE) { + u8 *active = (u8 *)state + c->activeOffset; + mmbit_clear(active, c->numRepeats); + } + return 0; +} + +static really_inline +void subCastleQueueCompressState(const struct Castle *c, const u32 subIdx, + const struct mq *q, const u64a offset) { + const struct SubCastle *sub = getSubCastle(c, subIdx); + const struct RepeatInfo *info = getRepeatInfo(sub); + union RepeatControl *rctrl = getControl(q->state, sub); + char *packed = (char *)q->streamState + sub->streamStateOffset; + DEBUG_PRINTF("sub %u next match %llu\n", subIdx, + repeatNextMatch(info, rctrl, + packed + info->packedCtrlSize, offset)); + repeatPack(packed, info, rctrl, offset); +} + +char nfaExecCastle_queueCompressState(const struct NFA *n, const struct mq *q, + s64a loc) { + assert(n && q); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry, loc=%lld\n", loc); + + const struct Castle *c = getImplNfa(n); + + // Pack state for all active repeats. 
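+    // Translate the queue-relative location into an absolute stream offset
+    // before packing each active repeat's control block.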
+ const u64a offset = q->offset + loc; + DEBUG_PRINTF("offset=%llu\n", offset); + if (c->exclusive) { + u8 *active = (u8 *)q->streamState; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + DEBUG_PRINTF("packing state for sub %u\n", activeIdx); + subCastleQueueCompressState(c, activeIdx, q, offset); + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + const u8 *active = (const u8 *)q->streamState + c->activeOffset; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { + DEBUG_PRINTF("packing state for sub %u\n", i); + subCastleQueueCompressState(c, i, q, offset); + } + } + return 0; +} + +static really_inline +void subCastleExpandState(const struct Castle *c, const u32 subIdx, + void *dest, const void *src, const u64a offset) { + const struct SubCastle *sub = getSubCastle(c, subIdx); + const struct RepeatInfo *info = getRepeatInfo(sub); + DEBUG_PRINTF("unpacking state for sub %u\n", subIdx); + union RepeatControl *rctrl = getControl(dest, sub); + const char *packed = (const char *)src + sub->streamStateOffset; + repeatUnpack(packed, info, offset, rctrl); + DEBUG_PRINTF("sub %u next match %llu\n", subIdx, + repeatNextMatch(info, rctrl, + packed + info->packedCtrlSize, offset)); +} + +char nfaExecCastle_expandState(const struct NFA *n, void *dest, const void *src, + u64a offset, UNUSED u8 key) { + assert(n && dest && src); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry, src=%p, dest=%p, offset=%llu\n", src, dest, offset); + + const struct Castle *c = getImplNfa(n); + + if (c->exclusive) { + const u8 *active = (const u8 *)src; + const u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + const u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + subCastleExpandState(c, activeIdx, dest, src, offset); + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + // Unpack state for all active repeats. + const u8 *active = (const u8 *)src + c->activeOffset; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { + subCastleExpandState(c, i, dest, src, offset); + } + } + return 0; +} diff --git a/regex/nfa/castle.h b/regex/nfa/castle.h new file mode 100644 index 000000000..cc7496ca7 --- /dev/null +++ b/regex/nfa/castle.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef NFA_CASTLE_H +#define NFA_CASTLE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "ue2common.h" + +struct mq; +struct NFA; + +char nfaExecCastle_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecCastle_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecCastle_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecCastle_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecCastle_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecCastle_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecCastle_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecCastle_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecCastle_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecCastle_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecCastle_testEOD NFA_API_NO_IMPL +#define nfaExecCastle_B_Reverse NFA_API_NO_IMPL +#define nfaExecCastle_zombie_status NFA_API_ZOMBIE_NO_IMPL + +#ifdef __cplusplus +} + +#endif // __cplusplus + +#endif diff --git a/regex/nfa/castle_internal.h b/regex/nfa/castle_internal.h new file mode 100644 index 000000000..429c232ff --- /dev/null +++ b/regex/nfa/castle_internal.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Castle: multi-tenant repeat engine, data structures. + */ + +#ifndef NFA_CASTLE_INTERNAL_H +#define NFA_CASTLE_INTERNAL_H + +#include "ue2common.h" +#include "repeat_internal.h" + +struct SubCastle { + ReportID report; //!< report to raise on match + u32 fullStateOffset; //!< offset within full state (scratch) + u32 streamStateOffset; //!< offset within stream state + u32 repeatInfoOffset; //!< offset of RepeatInfo structure + // relative to the start of SubCastle + u32 exclusiveId; //!< exclusive group id of this SubCastle, + // set to the number of SubCastles in Castle + // if it is not exclusive +}; + +#define CASTLE_DOT 0 +#define CASTLE_VERM 1 +#define CASTLE_NVERM 2 +#define CASTLE_SHUFTI 3 +#define CASTLE_TRUFFLE 4 + +enum ExclusiveType { + NOT_EXCLUSIVE, //!< no subcastles are exclusive + EXCLUSIVE, //!< a subset of subcastles are exclusive + PURE_EXCLUSIVE //!< all subcastles are exclusive +}; + +/** + * \brief Castle engine structure. + * + * A Castle is a collection of repeats that all share the same character + * reachability. + * + * The whole engine is laid out in memory as: + * + * - struct NFA + * - struct Castle + * - struct SubCastle[numRepeats] + * - tables for sparse model repeats + * - sparse iterator for subcastles that may be stale + * + * Castle stores an "active repeats" multibit in stream state, followed by the + * packed repeat state for each SubCastle. If there are both exclusive and + * non-exclusive SubCastle groups, we use an active id for each exclusive group + * and a multibit for the non-exclusive group. We also store an "active + * exclusive groups" multibit for exclusive groups. If all SubCastles are mutual + * exclusive, we remove "active repeats" multibit from stream state. + * * Castle stream state: + * * + * * |---| + * * | | active subengine id for exclusive group 1 + * * |---| + * * | | active subengine id for exclusive group 2(if necessary) + * * |---| + * * ... + * * |---| + * * | | "active repeats" multibit for non-exclusive subcastles + * * | | (if not all subcastles are exclusive) + * * |---| + * * | | active multibit for exclusive groups + * * | | + * * |---| + * * ||-|| common pool of stream state for exclusive group 1 + * * ||-|| + * * |---| + * * ||-|| common pool of stream state for exclusive group 2(if necessary) + * * ||-|| + * * |---| + * * ... + * * |---| + * * | | stream state for each non-exclusive subcastles + * * ... + * * | | + * * |---| + * + * In full state (stored in scratch space) it stores a temporary multibit over + * the repeats (used by \ref castleMatchLoop), followed by the repeat control + * blocks for each SubCastle. 
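+ *
+ * Illustrative sketch of how the runtime locates these regions at scan time
+ * (this mirrors the accessors in castle.c; c, sub and info stand for the
+ * Castle, a SubCastle and its RepeatInfo):
+ *
+ *     u8 *groups = (u8 *)stream_state + c->groupIterOffset;
+ *     u8 *active = (u8 *)stream_state + c->activeOffset;   // multibit
+ *     char *packed = (char *)stream_state + sub->streamStateOffset;
+ *     char *rstate = packed + info->packedCtrlSize;        // repeat state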
+ */ +struct ALIGN_AVX_DIRECTIVE Castle { + u32 numRepeats; //!< number of repeats in Castle + u32 numGroups; //!< number of exclusive groups + u8 type; //!< tells us which scanning mechanism (below) to use + u8 exclusive; //!< tells us if there are mutual exclusive SubCastles + u8 activeIdxSize; //!< number of bytes in stream state to store + // active SubCastle id for exclusive mode + u32 activeOffset; //!< offset to active multibit for non-exclusive + // SubCastles + u32 staleIterOffset; //!< offset to a sparse iterator to check for stale + // sub castles + u32 groupIterOffset; //!< offset to a iterator to check the aliveness of + // exclusive groups + + union { + struct { + char c; + } verm; + struct { + m128 mask_lo; + m128 mask_hi; + } shuf; + struct { + m128 mask1; + m128 mask2; + } truffle; + } u; +}; + +#endif // NFA_CASTLE_INTERNAL_H diff --git a/regex/nfa/gough.c b/regex/nfa/gough.c new file mode 100644 index 000000000..eebd54345 --- /dev/null +++ b/regex/nfa/gough.c @@ -0,0 +1,1147 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "gough.h" + +#include "accel.h" +#include "gough_internal.h" +#include "mcclellan.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "util/bitutils.h" +#include "util/compare.h" +#include "util/simd_utils.h" +#include "util/unaligned.h" +#include "ue2common.h" +#ifndef __KERNEL__ +#include +#else +#include +#endif + +#include "mcclellan_common_impl.h" + +#define GOUGH_SOM_EARLY (~0ULL) + +static really_inline +void compressSomValue(u32 comp_slot_width, u64a curr_offset, + void *dest_som_base, u32 i, u64a val) { + void *dest_som = (u8 *)dest_som_base + i * comp_slot_width; + /* gough does not initialise all slots, so may contain garbage */ + u64a delta = curr_offset - val; + switch (comp_slot_width) { + case 2: + if (delta >= (u16)~0U) { + delta = GOUGH_SOM_EARLY; + } + unaligned_store_u16(dest_som, delta); + break; + case 4: + if (delta >= (u32)~0U) { + delta = GOUGH_SOM_EARLY; + } + unaligned_store_u32(dest_som, delta); + break; + case 8: + if (delta >= ~0ULL) { + delta = GOUGH_SOM_EARLY; + } + unaligned_store_u64a(dest_som, delta); + break; + default: + assert(0); + } +} + +static really_inline +u64a expandSomValue(u32 comp_slot_width, u64a curr_offset, + const void *src_som_base, u32 i) { + /* Note: gough does not initialise all slots, so we may end up decompressing + * garbage */ + + const void *src_som = (const u8 *)src_som_base + i * comp_slot_width; + u64a val = 0; + switch (comp_slot_width) { + case 2: + val = unaligned_load_u16(src_som); + if (val == (u16)~0U) { + return GOUGH_SOM_EARLY; + } + break; + case 4: + val = unaligned_load_u32(src_som); + if (val == (u32)~0U) { + return GOUGH_SOM_EARLY; + } + break; + case 8: + val = unaligned_load_u64a(src_som); + if (val == ~0ULL) { + return GOUGH_SOM_EARLY; + } + break; + + default: + assert(0); + } + return curr_offset - val; +} + +static really_inline +char doReports(NfaCallback cb, void *ctxt, const struct mcclellan *m, + const struct gough_som_info *som, u16 s, u64a loc, + char eod, u16 * const cached_accept_state, + u32 * const cached_accept_id, u32 * const cached_accept_som) { + DEBUG_PRINTF("reporting state = %hu, loc=%llu, eod %hhu\n", + (u16)(s & STATE_MASK), loc, eod); + + if (!eod && s == *cached_accept_state) { + u64a from = *cached_accept_som == INVALID_SLOT ? loc + : som->slots[*cached_accept_som]; + if (cb(from, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + const struct mstate_aux *aux = get_aux(m, s); + size_t offset = eod ? aux->accept_eod : aux->accept; + + assert(offset); + const struct gough_report_list *rl + = (const void *)((const char *)m + offset - sizeof(struct NFA)); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list size %u\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = s; + *cached_accept_id = rl->report[0].r; + *cached_accept_som = rl->report[0].som; + + u64a from = *cached_accept_som == INVALID_SLOT ? loc + : som->slots[*cached_accept_som]; + DEBUG_PRINTF("reporting %u, using som[%u]=%llu\n", rl->report[0].r, + *cached_accept_som, from); + if (cb(from, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + u32 slot = rl->report[i].som; + u64a from = slot == INVALID_SLOT ? 
loc : som->slots[slot]; + DEBUG_PRINTF("reporting %u, using som[%u] = %llu\n", + rl->report[i].r, slot, from); + if (cb(from, loc, rl->report[i].r, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +#ifdef DUMP_SUPPORT +static UNUSED +const char *dump_op(u8 op) { + switch (op) { + case GOUGH_INS_END: + return "END"; + case GOUGH_INS_MOV: + return "MOV"; + case GOUGH_INS_NEW: + return "NEW"; + case GOUGH_INS_MIN: + return "MIN"; + default: + return "???"; + } +} +#endif + +static really_inline +void run_prog_i(UNUSED const struct NFA *nfa, + const struct gough_ins *pc, u64a som_offset, + struct gough_som_info *som) { + DEBUG_PRINTF("run prog at som_offset of %llu\n", som_offset); + while (1) { + assert((const u8 *)pc >= (const u8 *)nfa); + assert((const u8 *)pc < (const u8 *)nfa + nfa->length); + u32 dest = pc->dest; + u32 src = pc->src; + assert(pc->op == GOUGH_INS_END + || dest < (nfa->scratchStateSize - 16) / 8); + DEBUG_PRINTF("%s %u %u\n", dump_op(pc->op), dest, src); + switch (pc->op) { + case GOUGH_INS_END: + return; + case GOUGH_INS_MOV: + som->slots[dest] = som->slots[src]; + break; + case GOUGH_INS_NEW: + /* note: c has already been advanced */ + DEBUG_PRINTF("current offset %llu; adjust %u\n", som_offset, + pc->src); + assert(som_offset >= pc->src); + som->slots[dest] = som_offset - pc->src; + break; + case GOUGH_INS_MIN: + /* TODO: shift all values along by one so that a normal min works + */ + if (som->slots[src] == GOUGH_SOM_EARLY) { + som->slots[dest] = som->slots[src]; + } else if (som->slots[dest] != GOUGH_SOM_EARLY) { + LIMIT_TO_AT_MOST(&som->slots[dest], som->slots[src]); + } + break; + default: + assert(0); + return; + } + DEBUG_PRINTF("dest slot[%u] = %llu\n", dest, som->slots[dest]); + ++pc; + } +} + +static really_inline +void run_prog(const struct NFA *nfa, const u32 *edge_prog_table, + const u8 *buf, u64a offAdj, const u8 *c, u32 edge_num, + struct gough_som_info *som) { + DEBUG_PRINTF("taking edge %u\n", edge_num); + u32 prog_offset = edge_prog_table[edge_num]; + if (!prog_offset) { + DEBUG_PRINTF("no prog on edge\n"); + return; + } + + const struct gough_ins *pc = (const void *)((const u8 *)nfa + prog_offset); + u64a curr_offset = (u64a)(c - buf) + offAdj - 1; + run_prog_i(nfa, pc, curr_offset, som); +} + +static never_inline +void run_accel_prog(const struct NFA *nfa, const struct gough_accel *gacc, + const u8 *buf, u64a offAdj, const u8 *c, const u8 *c2, + struct gough_som_info *som) { + assert(gacc->prog_offset); + assert(c2 > c); + + const struct gough_ins *pc + = (const void *)((const u8 *)nfa + gacc->prog_offset); + s64a margin_dist = gacc->margin_dist; + + DEBUG_PRINTF("run accel after skip %lld margin; advanced %zd\n", + margin_dist, c2 - c); + + if (c2 - c <= 2 * margin_dist) { + while (c < c2) { + u64a curr_offset = (u64a)(c - buf) + offAdj; + run_prog_i(nfa, pc, curr_offset, som); + c++; + } + } else { + u64a curr_offset = (u64a)(c - buf) + offAdj; + for (s64a i = 0; i < margin_dist; i++) { + run_prog_i(nfa, pc, curr_offset + i, som); + } + + curr_offset = (u64a)(c2 - buf) + offAdj - margin_dist; + for (s64a i = 0; i < margin_dist; i++) { + run_prog_i(nfa, pc, curr_offset + i, som); + } + } +} + +static never_inline +u16 goughEnableStarts(const struct mcclellan *m, u16 s, u64a som_offset, + struct gough_som_info *som) { + DEBUG_PRINTF("top triggered while at %hu\n", s); + const struct mstate_aux *aux = get_aux(m, s); + DEBUG_PRINTF("now going to 
state %hu\n", aux->top); + + const u32 *top_offsets = get_gough_top_offsets(m); + if (!top_offsets) { + return aux->top; + } + + u32 prog_offset = top_offsets[s]; + if (!prog_offset) { + return aux->top; + } + + DEBUG_PRINTF("doing som for top\n"); + const struct NFA *nfa + = (const struct NFA *)((const char *)m - sizeof(struct NFA)); + const struct gough_ins *pc = (const void *)((const u8 *)nfa + + prog_offset); + run_prog_i(nfa, pc, som_offset, som); + return aux->top; +} + +static really_inline +char goughExec16_i(const struct mcclellan *m, struct gough_som_info *som, + u16 *state, const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, const u8 **c_final, + enum MatchMode mode) { + assert(ISALIGNED_N(state, 2)); + + u16 s = *state; + const struct NFA *nfa + = (const struct NFA *)((const char *)m - sizeof(struct NFA)); + const u8 *c = buf, *c_end = buf + len; + const u16 *succ_table = (const u16 *)((const char *)m + + sizeof(struct mcclellan)); + assert(ISALIGNED_N(succ_table, 2)); + const u16 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)nfa + m->sherman_offset; + const u32 as = m->alphaShift; + + s &= STATE_MASK; + + u32 cached_accept_id = 0; + u16 cached_accept_state = 0; + u32 cached_accept_som = 0; + + const u32 *edge_prog_table = (const u32 *)(get_gough(m) + 1); + + DEBUG_PRINTF("s: %hu, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + while (c < min_accel_offset && s) { + u8 cprime = m->remap[*(c++)]; + DEBUG_PRINTF("c: %02hhx cp:%02hhx (s=%hu)\n", *(c-1), cprime, s); + + u32 edge_num = ((u32)s << as) + cprime; + run_prog(nfa, edge_prog_table, buf, offAdj, c, edge_num, som); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[edge_num]; + } else { + const char *sherman_state + = findShermanState(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman\n"); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + DEBUG_PRINTF("s: %hu (%hu)\n", s, (u16)(s & STATE_MASK)); + + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_CONTINUE_MATCHING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (doReports(cb, ctxt, m, som, s & STATE_MASK, loc, 0, + &cached_accept_state, &cached_accept_id, + &cached_accept_som) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + s &= STATE_MASK; + } + +with_accel: + while (c < c_end && s) { + u8 cprime = m->remap[*(c++)]; + DEBUG_PRINTF("c: %02hhx cp:%02hhx (s=%hu)\n", *(c-1), cprime, s); + + u32 edge_num = ((u32)s << as) + cprime; + run_prog(nfa, edge_prog_table, buf, offAdj, c, edge_num, som); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[edge_num]; + } else { + const char *sherman_state + = findShermanState(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman\n"); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + DEBUG_PRINTF("s: %hu (%hu)\n", s, (u16)(s & STATE_MASK)); + + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_CONTINUE_MATCHING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (doReports(cb, ctxt, m, som, s & STATE_MASK, loc, 0, + &cached_accept_state, &cached_accept_id, + 
&cached_accept_som) + == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else if (s & ACCEL_FLAG) { + DEBUG_PRINTF("skipping\n"); + const struct mstate_aux *this_aux = get_aux(m, s & STATE_MASK); + u32 accel_offset = this_aux->accel_offset; + + assert(accel_offset >= m->aux_offset); + assert(accel_offset < m->sherman_offset); + + const struct gough_accel *gacc + = (const void *)((const char *)m + accel_offset); + assert(!gacc->prog_offset == !gacc->margin_dist); + const u8 *c2 = run_accel(&gacc->accel, c, c_end); + + if (c2 != c && gacc->prog_offset) { + run_accel_prog(nfa, gacc, buf, offAdj, c, c2, som); + } + + if (c2 < min_accel_offset + BAD_ACCEL_DIST) { + min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (min_accel_offset >= c_end - ACCEL_MIN_LEN) { + min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, min_accel_offset - c2, c_end - c2); + + c = c2; + s &= STATE_MASK; + goto without_accel; + } + + s &= STATE_MASK; + } + + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + *state = s; + + return MO_CONTINUE_MATCHING; +} + +static really_inline +char goughExec8_i(const struct mcclellan *m, struct gough_som_info *som, + u8 *state, const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, const u8 **c_final, + enum MatchMode mode) { + u8 s = *state; + const u8 *c = buf, *c_end = buf + len; + const u8 *succ_table = (const u8 *)((const char *)m + + sizeof(struct mcclellan)); + const u32 as = m->alphaShift; + const struct mstate_aux *aux; + + const struct NFA *nfa + = (const struct NFA *)((const char *)m - sizeof(struct NFA)); + aux = (const struct mstate_aux *)((const char *)nfa + m->aux_offset); + + const u32 *edge_prog_table = (const u32 *)(get_gough(m) + 1); + + u16 accel_limit = m->accel_limit_8; + u16 accept_limit = m->accept_limit_8; + + u32 cached_accept_id = 0; + u16 cached_accept_state = 0; + u32 cached_accept_som = 0; + + DEBUG_PRINTF("accel %hu, accept %hu\n", accel_limit, accept_limit); + + DEBUG_PRINTF("s: %hhu, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + while (c < min_accel_offset && s) { + u8 cprime = m->remap[*(c++)]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *(c-1), + ourisprint(*(c-1)) ? *(c-1) : '?', cprime); + + u32 edge_num = ((u32)s << as) + cprime; + + run_prog(nfa, edge_prog_table, buf, offAdj, c, edge_num, som); + + s = succ_table[edge_num]; + DEBUG_PRINTF("s: %hhu\n", s); + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_CONTINUE_MATCHING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (doReports(cb, ctxt, m, som, s, loc, 0, + &cached_accept_state, &cached_accept_id, + &cached_accept_som) + == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + +with_accel: + while (c < c_end && s) { + u8 cprime = m->remap[*(c++)]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *(c-1), + ourisprint(*(c-1)) ? 
*(c-1) : '?', cprime); + + u32 edge_num = ((u32)s << as) + cprime; + + run_prog(nfa, edge_prog_table, buf, offAdj, c, edge_num, som); + + s = succ_table[edge_num]; + DEBUG_PRINTF("s: %hhu\n", s); + + if (s >= accel_limit) { /* accept_limit >= accel_limit */ + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_CONTINUE_MATCHING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (doReports(cb, ctxt, m, som, s, loc, 0, + &cached_accept_state, &cached_accept_id, + &cached_accept_som) + == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else if (aux[s].accel_offset) { + DEBUG_PRINTF("skipping\n"); + + const struct gough_accel *gacc + = (const void *)((const char *)m + aux[s].accel_offset); + const u8 *c2 = run_accel(&gacc->accel, c, c_end); + + if (c2 != c && gacc->prog_offset) { + run_accel_prog(nfa, gacc, buf, offAdj, c, c2, som); + } + + if (c2 < min_accel_offset + BAD_ACCEL_DIST) { + min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (min_accel_offset >= c_end - ACCEL_MIN_LEN) { + min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, min_accel_offset - c2, c_end - c2); + + c = c2; + goto without_accel; + } + } + } + + *state = s; + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + return MO_CONTINUE_MATCHING; +} + +static never_inline +char goughExec8_i_ni(const struct mcclellan *m, struct gough_som_info *som, + u8 *state, const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, const u8 **final_point, + enum MatchMode mode) { + return goughExec8_i(m, som, state, buf, len, offAdj, cb, ctxt, final_point, + mode); +} + +static never_inline +char goughExec16_i_ni(const struct mcclellan *m, struct gough_som_info *som, + u16 *state, const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, const u8 **final_point, + enum MatchMode mode) { + return goughExec16_i(m, som, state, buf, len, offAdj, cb, ctxt, final_point, + mode); +} + +static really_inline +struct gough_som_info *getSomInfo(char *state_base) { + return (struct gough_som_info *)(state_base + 16); +} + +static really_inline +const struct gough_som_info *getSomInfoConst(const char *state_base) { + return (const struct gough_som_info *)(state_base + 16); +} + +static really_inline +char nfaExecGough8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, s64a end, enum MatchMode mode) { + DEBUG_PRINTF("enter\n"); + struct gough_som_info *som = getSomInfo(q->state); + assert(n->type == GOUGH_NFA_8); + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + s64a sp; + u8 s = *(u8 *)q->state; + + if (q->report_current) { + assert(s); + assert(s >= m->accept_limit_8); + + u32 cached_accept_id = 0; + u16 cached_accept_state = 0; + u32 cached_accept_som = 0; + + int rv = doReports(cb, context, m, som, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id, + &cached_accept_som); + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
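/*
 * Layout note (informal): q->state holds the "full" scratch state of a
 * Gough engine - the current DFA state id (u8 or u16) sits at offset 0
 * and the gough_som_info slot array starts at the fixed 16-byte offset
 * encoded by getSomInfo()/getSomInfoConst().  The *_i_ni wrappers are
 * plain never_inline shims around the really_inline cores so the large
 * scan loops are not inlined into every queue-execution caller.
 */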
hend : buffer; + + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + /* this is as far as we go */ + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + DEBUG_PRINTF("%s @ %llu [som %llu]\n", + q->items[q->cur].type == MQE_TOP ? "TOP" : + q->items[q->cur].type == MQE_END ? "END" : "???", + q->items[q->cur].location + offset, q->items[q->cur].som); + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + DEBUG_PRINTF("run to %lld from %lld\n", ep, sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + const u8 *final_look; + if (goughExec8_i_ni(m, som, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, &final_look, mode) + == MO_HALT_MATCHING) { + *(u8 *)q->state = 0; + return 0; + } + if (mode == STOP_AT_MATCH && final_look != cur_buf + local_ep) { + /* found a match */ + DEBUG_PRINTF("found a match\n"); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u8 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + /* this is as far as we go */ + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(!s || sp + offset > 0); + if (sp + offset == 0) { + s = (u8)m->start_anchored; + break; + } + s = goughEnableStarts(m, s, q->items[q->cur].som, som); + break; + case MQE_END: + *(u8 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : 0; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + + +static really_inline +char nfaExecGough16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, s64a end, enum MatchMode mode) { + struct gough_som_info *som = getSomInfo(q->state); + assert(n->type == GOUGH_NFA_16); + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + s64a sp; + + assert(ISALIGNED_N(q->state, 2)); + u16 s = *(u16 *)q->state; + + if (q->report_current) { + assert(s); + assert(get_aux(m, s)->accept); + + u32 cached_accept_id = 0; + u16 cached_accept_state = 0; + u32 cached_accept_som = 0; + + int rv = doReports(cb, context, m, som, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id, + &cached_accept_som); + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
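/*
 * Informal recap of the queue-driven execution model (simplified): the
 * mq carries an MQE_START item, optional MQE_TOP items and a closing
 * MQE_END, with locations relative to q->offset; negative locations
 * address the history buffer (via hend), non-negative ones the main
 * buffer.  The driver loop above roughly does:
 *
 *     sp = MQE_START location; cur_buf = sp < 0 ? history : buffer;
 *     for each remaining event:
 *         scan from sp to min(event location, end);
 *         match in STOP_AT_MATCH -> rewrite the slot as a new MQE_START at
 *                                   the match point, MO_MATCHES_PENDING;
 *         event beyond 'end'     -> park MQE_START at end, MO_ALIVE;
 *         MQE_TOP                -> goughEnableStarts() re-seeds the start
 *                                   state and records the item's som value;
 *         MQE_END                -> MO_ALIVE if a state survives, else 0.
 */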
hend : buffer; + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + /* this is as far as we go */ + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + /* do main buffer region */ + const u8 *final_look; + if (goughExec16_i_ni(m, som, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, &final_look, mode) + == MO_HALT_MATCHING) { + *(u16 *)q->state = 0; + return 0; + } + if (mode == STOP_AT_MATCH && final_look != cur_buf + local_ep) { + /* this is as far as we go */ + assert(q->cur); + DEBUG_PRINTF("state %hu final_look %zd\n", s, + final_look - cur_buf); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u16 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + /* this is as far as we go */ + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(!s || sp + offset > 0); + if (sp + offset == 0) { + s = m->start_anchored; + break; + } + s = goughEnableStarts(m, s, q->items[q->cur].som, som); + break; + case MQE_END: + *(u16 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : 0; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +char nfaExecGough8_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == GOUGH_NFA_8); + const u8 *hend = q->history + q->hlength; + + return nfaExecGough8_Q2i(n, offset, buffer, hend, cb, context, q, end, + CALLBACK_OUTPUT); +} + +char nfaExecGough16_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == GOUGH_NFA_16); + const u8 *hend = q->history + q->hlength; + + return nfaExecGough16_Q2i(n, offset, buffer, hend, cb, context, q, end, + CALLBACK_OUTPUT); +} + +char nfaExecGough8_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == GOUGH_NFA_8); + const u8 *hend = q->history + q->hlength; + + return nfaExecGough8_Q2i(n, offset, buffer, hend, cb, context, q, end, + STOP_AT_MATCH); +} + +char nfaExecGough16_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == GOUGH_NFA_16); + const u8 *hend = q->history + q->hlength; + + return nfaExecGough16_Q2i(n, offset, buffer, hend, cb, context, q, end, + STOP_AT_MATCH); +} + +char nfaExecGough8_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == GOUGH_NFA_8); + const u8 *hend = q->history + q->hlength; + + char rv = 
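/*
 * Informal note on the public entry points: _Q, _Q2 and _QR only differ
 * in the MatchMode handed to the shared _Q2i core - _Q delivers matches
 * through the callback (CALLBACK_OUTPUT), _Q2 pauses on the first match
 * and returns MO_MATCHES_PENDING (STOP_AT_MATCH), and _QR runs the
 * automaton silently (NO_MATCHES) and then asks the McClellan inAccept
 * helper whether the requested report is accepted by the final state.
 */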
nfaExecGough8_Q2i(n, offset, buffer, hend, cb, context, q, + 0 /* end */, NO_MATCHES); + if (rv && nfaExecMcClellan8_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecGough16_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == GOUGH_NFA_16); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecGough16_Q2i(n, offset, buffer, hend, cb, context, q, + 0 /* end */, NO_MATCHES); + + if (rv && nfaExecMcClellan16_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecGough8_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + memset(state, 0, nfa->streamStateSize); + u8 s = offset ? m->start_floating : m->start_anchored; + if (s) { + *(u8 *)state = s; + return 1; + } + return 0; +} + +char nfaExecGough16_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + memset(state, 0, nfa->streamStateSize); + u16 s = offset ? m->start_floating : m->start_anchored; + if (s) { + unaligned_store_u16(state, s); + return 1; + } + return 0; +} + + +char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + u64a offset = q_cur_offset(q); + struct gough_som_info *som = getSomInfo(q->state); + assert(q_cur_type(q) == MQE_START); + assert(s); + + if (s >= m->accept_limit_8) { + u32 cached_accept_id = 0; + u16 cached_accept_state = 0; + u32 cached_accept_som = 0; + + doReports(cb, ctxt, m, som, s, offset, 0, &cached_accept_state, + &cached_accept_id, &cached_accept_som); + } + + return 0; +} + +char nfaExecGough16_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u16 s = *(u16 *)q->state; + const struct mstate_aux *aux = get_aux(m, s); + u64a offset = q_cur_offset(q); + struct gough_som_info *som = getSomInfo(q->state); + assert(q_cur_type(q) == MQE_START); + DEBUG_PRINTF("state %hu\n", s); + assert(s); + + if (aux->accept) { + u32 cached_accept_id = 0; + u16 cached_accept_state = 0; + u32 cached_accept_som = 0; + + doReports(cb, ctxt, m, som, s, offset, 0, &cached_accept_state, + &cached_accept_id, &cached_accept_som); + } + + return 0; +} + +char nfaExecGough8_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + return nfaExecMcClellan8_inAccept(n, report, q); +} + +char nfaExecGough16_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + return nfaExecMcClellan16_inAccept(n, report, q); +} + +char nfaExecGough8_inAnyAccept(const struct NFA *n, struct mq *q) { + return nfaExecMcClellan8_inAnyAccept(n, q); +} + +char nfaExecGough16_inAnyAccept(const struct NFA *n, struct mq *q) { + return nfaExecMcClellan16_inAnyAccept(n, q); +} + +static +char goughCheckEOD(const struct NFA *nfa, u16 s, + const struct gough_som_info *som, + u64a offset, NfaCallback cb, void *ctxt) { + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + const struct mstate_aux *aux = get_aux(m, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } 
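/*
 * Informal note: end-of-data accepts are tracked separately via
 * aux->accept_eod, so goughCheckEOD() raises them with the eod flag of
 * doReports() set, independently of the per-byte accepts.  Stream state,
 * by contrast, is minimal: initCompressedState() just zeroes it and
 * stores the anchored or floating start state, and on compress/expand
 * each SOM slot is packed into stream_som_loc_width bytes relative to
 * the current stream offset (compSomSpace()/expandSomSpace() below).
 */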
+ return doReports(cb, ctxt, m, som, s, offset, 1, NULL, NULL, NULL); +} + +char nfaExecGough8_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + const struct gough_som_info *som = getSomInfoConst(state); + return goughCheckEOD(nfa, *(const u8 *)state, som, offset, callback, + context); +} + +char nfaExecGough16_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(ISALIGNED_N(state, 8)); + const struct gough_som_info *som = getSomInfoConst(state); + return goughCheckEOD(nfa, *(const u16 *)state, som, offset, callback, + context); +} + +char nfaExecGough8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + memset(q->state, 0, nfa->scratchStateSize); + return 0; +} + +char nfaExecGough16_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + memset(q->state, 0, nfa->scratchStateSize); + assert(ISALIGNED_N(q->state, 2)); + return 0; +} + +static really_inline +void compSomSpace(const struct NFA *nfa, u8 *dest_som_base, + const struct gough_som_info *src, u64a curr_offset) { + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + const struct gough_info *gi = get_gough(m); + u32 count = gi->stream_som_loc_count; + u32 width = gi->stream_som_loc_width; + + for (u32 i = 0; i < count; i++) { + compressSomValue(width, curr_offset, dest_som_base, i, src->slots[i]); + } +} + +static really_inline +void expandSomSpace(const struct NFA *nfa, struct gough_som_info *som, + const u8 *src_som_base, u64a curr_offset) { + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + const struct gough_info *gi = get_gough(m); + u32 count = gi->stream_som_loc_count; + u32 width = gi->stream_som_loc_width; + + for (u32 i = 0; i < count; i++) { + som->slots[i] = expandSomValue(width, curr_offset, src_som_base, i); + } +} + +char nfaExecGough8_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + + *(u8 *)dest = *(const u8 *)src; + compSomSpace(nfa, (u8 *)dest + 1, getSomInfoConst(src), q->offset + loc); + return 0; +} + +char nfaExecGough8_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, UNUSED u8 key) { + *(u8 *)dest = *(const u8 *)src; + expandSomSpace(nfa, getSomInfo(dest), (const u8 *)src + 1, offset); + return 0; +} + +char nfaExecGough16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + + assert(ISALIGNED_N(src, 2)); + unaligned_store_u16(dest, *(const u16 *)(src)); + compSomSpace(nfa, (u8 *)dest + 2, getSomInfoConst(src), q->offset + loc); + return 0; +} + +char nfaExecGough16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, UNUSED u8 key) { + assert(ISALIGNED_N(dest, 2)); + *(u16 *)dest = unaligned_load_u16(src); + expandSomSpace(nfa, getSomInfo(dest), (const u8 *)src + 2, offset); + return 0; +} diff --git a/regex/nfa/gough.h b/regex/nfa/gough.h new file mode 100644 index 000000000..a7f488923 --- /dev/null +++ b/regex/nfa/gough.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of 
conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef GOUGH_H +#define GOUGH_H + +#include "callback.h" +#include "ue2common.h" + +struct NFA; +struct mq; + +// 8-bit Gough + +char nfaExecGough8_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecGough8_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecGough8_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecGough8_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecGough8_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecGough8_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecGough8_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecGough8_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecGough8_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecGough8_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecGough8_B_Reverse NFA_API_NO_IMPL +#define nfaExecGough8_zombie_status NFA_API_ZOMBIE_NO_IMPL + +// 16-bit Gough + +char nfaExecGough16_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecGough16_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecGough16_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecGough16_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecGough16_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecGough16_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecGough16_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecGough16_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecGough16_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecGough16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecGough16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecGough16_B_Reverse NFA_API_NO_IMPL +#define nfaExecGough16_zombie_status 
NFA_API_ZOMBIE_NO_IMPL + +#endif diff --git a/regex/nfa/gough_internal.h b/regex/nfa/gough_internal.h new file mode 100644 index 000000000..8bf06e0f7 --- /dev/null +++ b/regex/nfa/gough_internal.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef GOUGH_INTERNAL_H +#define GOUGH_INTERNAL_H + +#include "accel.h" +#include "mcclellan_internal.h" +#include "ue2common.h" + +#define INVALID_SLOT (~0U) + +#define GOUGH_INS_END 0 +#define GOUGH_INS_MOV 1 +#define GOUGH_INS_NEW 2 +#define GOUGH_INS_MIN 3 +/* todo: add instructions targeting acc reg? */ + +struct gough_ins { + u32 op; /* u32 to avoid padding */ + u32 dest; + u32 src; /* for GOUGH_INS_NEW, this specifies the adjustment to apply to the + * current offset */ +}; + +/* + * HAPPY FUN ASCII ART TIME + * + * ---- + * | | struct NFA + * ---- + * ~~~~ normal(ish) mcclellan engine + * ~~~~ + * ~~~~ + * ~~~~ + * ~~~~ + * ~~~~ + * ~~~~ + * ~~~~ + * ---- = m->haig_offset + * | | } struct gough_info + * ---- + * | | } + * | | } edge prog table -> provides the offset of the start of the program + * | | } to run when the edge is taken. 0 indicates no + * | | } work to do + * ---- = h->top_prog_offset + * | | } + * | | } top prog table -> provides the offset of the start of the program + * | | } to run when a top is taken from this state. 
0 + * | | } indicates nothing to do + * ---- = h->prog_base_offset + * | | } + * | | } programs to run + * | | } + * | | } + * ---- + */ + +struct gough_info { + u32 top_prog_offset; /**< offset to the base of the top prog table */ + u32 prog_base_offset; /**< not used at runtime */ + u32 stream_som_loc_count; /**< number of som locs in the stream state */ + u8 stream_som_loc_width; /**< number of bytes per som loc */ +}; + +static really_inline +const struct gough_info *get_gough(const struct mcclellan *m) { + assert(m->haig_offset); + const char *n = (const char *)m - sizeof(struct NFA); + return (const struct gough_info *)(n + m->haig_offset); +} + +static really_inline +const u32 *get_gough_top_offsets(const struct mcclellan *m) { + const struct gough_info *g = get_gough(m); + if (!g->top_prog_offset) { + return NULL; + } + const char *n = (const char *)m - sizeof(struct NFA); + return (const u32 *)(n + g->top_prog_offset); +} + +/* Gough state representation in scratch. + * + * During execution, gough tracks a number of variables containing potential + * starts of match. These are all stored in a large array of u64a slots. + */ +struct gough_som_info { + u64a slots[1]; /* 'flexible' member array */ +}; + +struct gough_report { + ReportID r; + u32 som; /* som slot to report */ +}; + +struct gough_report_list { + u32 count; + struct gough_report report[]; +}; + +struct gough_accel { + union AccelAux accel; + u8 margin_dist; + u32 prog_offset; +}; + +#endif diff --git a/regex/nfa/lbr.c b/regex/nfa/lbr.c new file mode 100644 index 000000000..d403733a6 --- /dev/null +++ b/regex/nfa/lbr.c @@ -0,0 +1,531 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Large Bounded Repeat (LBR) engine: runtime code. 
+ */ +#include "lbr.h" + +#include "lbr_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "repeat.h" +#include "repeat_internal.h" +#include "shufti.h" +#include "truffle.h" +#include "vermicelli.h" +#include "util/partial_store.h" +#include "util/unaligned.h" + +/** \brief Sentinel value used to indicate that a repeat is dead/empty/unused. + * * */ +#define REPEAT_DEAD 0xffffffffffffffffull + +enum MatchMode { + CALLBACK_OUTPUT, + STOP_AT_MATCH, +}; + +static really_inline +const struct RepeatInfo *getRepeatInfo(const struct lbr_common *l) { + const struct RepeatInfo *repeatInfo = + (const struct RepeatInfo *)((const char *)l + l->repeatInfoOffset); + return repeatInfo; +} + +static really_inline +void lbrCompressState(const struct lbr_common *l, u64a offset, + const struct lbr_state *lstate, char *stream_state) { + assert(l && lstate && stream_state); + assert(ISALIGNED(lstate)); + + const struct RepeatInfo *info = getRepeatInfo(l); + repeatPack(stream_state, info, &lstate->ctrl, offset); +} + +static really_inline +void lbrExpandState(const struct lbr_common *l, u64a offset, + const char *stream_state, struct lbr_state *lstate) { + assert(l && stream_state && lstate); + assert(ISALIGNED(lstate)); + + const struct RepeatInfo *info = getRepeatInfo(l); + repeatUnpack(stream_state, info, offset, &lstate->ctrl); + lstate->lastEscape = 0; +} + +static really_inline +void clearRepeat(const struct RepeatInfo *info, struct lbr_state *lstate) { + assert(info && lstate); + + DEBUG_PRINTF("clear repeat at %p\n", lstate); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + lstate->ctrl.ring.offset = REPEAT_DEAD; + break; + case REPEAT_RANGE: + lstate->ctrl.range.offset = REPEAT_DEAD; + break; + case REPEAT_FIRST: + case REPEAT_LAST: + lstate->ctrl.offset.offset = REPEAT_DEAD; + break; + case REPEAT_BITMAP: + lstate->ctrl.bitmap.offset = REPEAT_DEAD; + break; + case REPEAT_SPARSE_OPTIMAL_P: + lstate->ctrl.ring.offset = REPEAT_DEAD; + break; + case REPEAT_TRAILER: + lstate->ctrl.trailer.offset = REPEAT_DEAD; + break; + default: + assert(0); + break; + } +} + +static really_inline +char repeatIsDead(const struct RepeatInfo *info, + const struct lbr_state *lstate) { + assert(info && lstate); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + return lstate->ctrl.ring.offset == REPEAT_DEAD; + case REPEAT_RANGE: + return lstate->ctrl.range.offset == REPEAT_DEAD; + case REPEAT_FIRST: + case REPEAT_LAST: + return lstate->ctrl.offset.offset == REPEAT_DEAD; + case REPEAT_BITMAP: + return lstate->ctrl.bitmap.offset == REPEAT_DEAD; + case REPEAT_SPARSE_OPTIMAL_P: + return lstate->ctrl.ring.offset == REPEAT_DEAD; + case REPEAT_TRAILER: + return lstate->ctrl.trailer.offset == REPEAT_DEAD; + case REPEAT_ALWAYS: + assert(!"REPEAT_ALWAYS should only be used by Castle"); + return 0; + } + + assert(0); + return 1; +} + +/** Returns true if the LBR can produce matches at offsets greater than the + * given one. TODO: can this be combined with lbrIsActive? 
*/ +static really_inline +char lbrIsAlive(const struct lbr_common *l, const struct lbr_state *lstate, + const char *state, u64a offset) { + assert(l && lstate && state); + + const struct RepeatInfo *info = getRepeatInfo(l); + if (repeatIsDead(info, lstate)) { + DEBUG_PRINTF("repeat is dead\n"); + return 0; + } + + if (info->repeatMax == REPEAT_INF) { + DEBUG_PRINTF("active repeat with inf max bound, alive\n"); + return 1; + } + + assert(info->repeatMax < REPEAT_INF); + const char *repeatState = state + info->packedCtrlSize; + u64a lastTop = repeatLastTop(info, &lstate->ctrl, repeatState); + if (offset < lastTop + info->repeatMax) { + DEBUG_PRINTF("alive, as we can still produce matches after %llu\n", + offset); + return 1; + } + + DEBUG_PRINTF("dead\n"); + return 0; +} + +/** Returns true if the LBR is matching at the given offset or it could produce + * a match in the future. */ +static really_inline +char lbrIsActive(const struct lbr_common *l, const struct lbr_state *lstate, + const char *state, u64a offset) { + assert(l && lstate && state); + const struct RepeatInfo *info = getRepeatInfo(l); + assert(!repeatIsDead(info, lstate)); // Guaranteed by caller. + + const char *repeatState = state + info->packedCtrlSize; + if (repeatHasMatch(info, &lstate->ctrl, repeatState, offset) == + REPEAT_MATCH) { + DEBUG_PRINTF("currently matching\n"); + return 1; + } + + u64a i = repeatNextMatch(info, &lstate->ctrl, repeatState, offset); + if (i != 0) { + DEBUG_PRINTF("active, next match is at %llu\n", i); + return 1; + } + + DEBUG_PRINTF("no more matches\n"); + return 0; +} + +static really_inline +void lbrTop(const struct lbr_common *l, struct lbr_state *lstate, char *state, + u64a offset) { + assert(l && lstate && state); + DEBUG_PRINTF("top at %llu\n", offset); + + const struct RepeatInfo *info = getRepeatInfo(l); + char *repeatState = state + info->packedCtrlSize; + + char is_alive = !repeatIsDead(info, lstate); + if (is_alive) { + // Ignore duplicate TOPs. + u64a last = repeatLastTop(info, &lstate->ctrl, repeatState); + assert(last <= offset); + if (last == offset) { + return; + } + } + + repeatStore(info, &lstate->ctrl, repeatState, offset, is_alive); +} + +static really_inline +char lbrInAccept(const struct lbr_common *l, const struct lbr_state *lstate, + const char *state, u64a offset, ReportID report) { + assert(l && lstate && state); + DEBUG_PRINTF("offset=%llu, report=%u\n", offset, report); + + if (report != l->report) { + DEBUG_PRINTF("report=%u is not LBR report %u\n", report, l->report); + return 0; + } + + const struct RepeatInfo *info = getRepeatInfo(l); + assert(!repeatIsDead(info, lstate)); // Guaranteed by caller. 
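/*
 * Worked example (illustrative numbers only, not from the sources): for
 * a bounded repeat {3,7} whose most recent top was at stream offset 100,
 * repeatHasMatch() reports a match for offsets in [103, 107];
 * lbrIsAlive() asks whether a match can still occur strictly after the
 * current offset, so it declares the repeat dead once the offset reaches
 * 107 with no further top; and lbrTop() ignores a duplicate top at
 * offset 100, since it cannot change either bound.
 */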
+ + const char *repeatState = state + info->packedCtrlSize; + return repeatHasMatch(info, &lstate->ctrl, repeatState, offset) == + REPEAT_MATCH; +} + +static really_inline +char lbrFindMatch(const struct lbr_common *l, const u64a begin, const u64a end, + const struct lbr_state *lstate, const char *state, + size_t *mloc) { + DEBUG_PRINTF("begin=%llu, end=%llu\n", begin, end); + assert(begin <= end); + + if (begin == end) { + return 0; + } + + const struct RepeatInfo *info = getRepeatInfo(l); + const char *repeatState = state + info->packedCtrlSize; + u64a i = repeatNextMatch(info, &lstate->ctrl, repeatState, begin); + if (i == 0) { + DEBUG_PRINTF("no more matches\n"); + return 0; + } + if (i > end) { + DEBUG_PRINTF("next match at %llu is beyond the horizon\n", i); + return 0; + } + + DEBUG_PRINTF("stop at match at %llu\n", i); + assert(mloc); + *mloc = i - begin; + return 1; +} + +static really_inline +char lbrMatchLoop(const struct lbr_common *l, const u64a begin, const u64a end, + const struct lbr_state *lstate, const char *state, + NfaCallback cb, void *ctx) { + DEBUG_PRINTF("begin=%llu, end=%llu\n", begin, end); + assert(begin <= end); + + if (begin == end) { + return MO_CONTINUE_MATCHING; + } + + const struct RepeatInfo *info = getRepeatInfo(l); + const char *repeatState = state + info->packedCtrlSize; + + u64a i = begin; + for (;;) { + i = repeatNextMatch(info, &lstate->ctrl, repeatState, i); + if (i == 0) { + DEBUG_PRINTF("no more matches\n"); + return MO_CONTINUE_MATCHING; + } + if (i > end) { + DEBUG_PRINTF("next match at %llu is beyond the horizon\n", i); + return MO_CONTINUE_MATCHING; + } + + DEBUG_PRINTF("firing match at %llu\n", i); + if (cb(0, i, l->report, ctx) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + assert(0); + return MO_CONTINUE_MATCHING; +} + +static really_inline +char lbrRevScanDot(UNUSED const struct NFA *nfa, UNUSED const u8 *buf, + UNUSED size_t begin, UNUSED size_t end, + UNUSED size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_DOT); + // Nothing can kill a dot! 
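/*
 * Informal sketch of the match-emission helpers above: repeatNextMatch()
 * yields the next offset at which the repeat is satisfied (0 if none),
 * so lbrFindMatch() converts that into a relative location for
 * STOP_AT_MATCH mode, while lbrMatchLoop() keeps firing the LBR's single
 * report through the NfaCallback until the horizon is passed or the
 * callback returns MO_HALT_MATCHING.  The lbrRevScan* family that
 * follows answers one question per character class: does an escape byte
 * occur in a given window, scanning from its end backwards?
 */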
+ return 0; +} + +static really_inline +char lbrRevScanVerm(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_VERM); + const struct lbr_verm *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = rvermicelliExec(l->c, 0, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + assert((char)*ptr == l->c); + return 1; +} + +static really_inline +char lbrRevScanNVerm(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_NVERM); + const struct lbr_verm *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = rnvermicelliExec(l->c, 0, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + assert((char)*ptr != l->c); + return 1; +} + +static really_inline +char lbrRevScanShuf(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, + size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_SHUF); + const struct lbr_shuf *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = rshuftiExec(l->mask_lo, l->mask_hi, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char lbrRevScanTruf(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, + size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_TRUF); + const struct lbr_truf *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = rtruffleExec(l->mask1, l->mask2, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char lbrFwdScanDot(UNUSED const struct NFA *nfa, UNUSED const u8 *buf, + UNUSED size_t begin, UNUSED size_t end, + UNUSED size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_DOT); + // Nothing can kill a dot! 
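/*
 * Informal note on the scanner conventions here: the reverse primitives
 * (rvermicelliExec, rshuftiExec, rtruffleExec) signal "no escape found"
 * by returning buf + begin - 1, one byte before the scanned window,
 * while the forward primitives below return buf + end, one byte past it.
 * Each LBR variant pairs the shared repeat bookkeeping with the cheapest
 * scanner for its escape set: a single byte (Verm), its complement
 * (NVerm), a shufti mask pair (Shuf), an arbitrary byte set via truffle
 * (Truf), or nothing at all (Dot).
 */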
+ return 0; +} + +static really_inline +char lbrFwdScanVerm(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_VERM); + const struct lbr_verm *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = vermicelliExec(l->c, 0, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + assert((char)*ptr == l->c); + return 1; +} + +static really_inline +char lbrFwdScanNVerm(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_NVERM); + const struct lbr_verm *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = nvermicelliExec(l->c, 0, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + assert((char)*ptr != l->c); + return 1; +} + +static really_inline +char lbrFwdScanShuf(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, + size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_SHUF); + const struct lbr_shuf *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = shuftiExec(l->mask_lo, l->mask_hi, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char lbrFwdScanTruf(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, + size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_TRUF); + const struct lbr_truf *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = truffleExec(l->mask1, l->mask2, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +#define ENGINE_ROOT_NAME Dot +#include "lbr_common_impl.h" + +#define ENGINE_ROOT_NAME Verm +#include "lbr_common_impl.h" + +#define ENGINE_ROOT_NAME NVerm +#include "lbr_common_impl.h" + +#define ENGINE_ROOT_NAME Shuf +#include "lbr_common_impl.h" + +#define ENGINE_ROOT_NAME Truf +#include "lbr_common_impl.h" diff --git a/regex/nfa/lbr.h b/regex/nfa/lbr.h new file mode 100644 index 000000000..a9e42046d --- /dev/null +++ b/regex/nfa/lbr.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LBR_H +#define LBR_H + +#include "ue2common.h" + +struct mq; +struct NFA; + +#ifdef __cplusplus +extern "C" +{ +#endif + +// LBR Dot + +char nfaExecLbrDot_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrDot_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrDot_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecLbrDot_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecLbrDot_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecLbrDot_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecLbrDot_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecLbrDot_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecLbrDot_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecLbrDot_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecLbrDot_testEOD NFA_API_NO_IMPL +#define nfaExecLbrDot_B_Reverse NFA_API_NO_IMPL +#define nfaExecLbrDot_zombie_status NFA_API_ZOMBIE_NO_IMPL + +// LBR Verm + +char nfaExecLbrVerm_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrVerm_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrVerm_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecLbrVerm_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecLbrVerm_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecLbrVerm_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecLbrVerm_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecLbrVerm_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecLbrVerm_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecLbrVerm_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecLbrVerm_testEOD NFA_API_NO_IMPL +#define nfaExecLbrVerm_B_Reverse NFA_API_NO_IMPL +#define nfaExecLbrVerm_zombie_status NFA_API_ZOMBIE_NO_IMPL + +// LBR Negated Verm + +char nfaExecLbrNVerm_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrNVerm_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrNVerm_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecLbrNVerm_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecLbrNVerm_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecLbrNVerm_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecLbrNVerm_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecLbrNVerm_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 
key); +char nfaExecLbrNVerm_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecLbrNVerm_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecLbrNVerm_testEOD NFA_API_NO_IMPL +#define nfaExecLbrNVerm_B_Reverse NFA_API_NO_IMPL +#define nfaExecLbrNVerm_zombie_status NFA_API_ZOMBIE_NO_IMPL + +// LBR Shuf + +char nfaExecLbrShuf_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrShuf_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrShuf_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecLbrShuf_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecLbrShuf_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecLbrShuf_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecLbrShuf_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecLbrShuf_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecLbrShuf_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecLbrShuf_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecLbrShuf_testEOD NFA_API_NO_IMPL +#define nfaExecLbrShuf_B_Reverse NFA_API_NO_IMPL +#define nfaExecLbrShuf_zombie_status NFA_API_ZOMBIE_NO_IMPL + +// LBR Truffle + +char nfaExecLbrTruf_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrTruf_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrTruf_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecLbrTruf_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecLbrTruf_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecLbrTruf_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecLbrTruf_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecLbrTruf_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecLbrTruf_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecLbrTruf_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecLbrTruf_testEOD NFA_API_NO_IMPL +#define nfaExecLbrTruf_B_Reverse NFA_API_NO_IMPL +#define nfaExecLbrTruf_zombie_status NFA_API_ZOMBIE_NO_IMPL + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/nfa/lbr_common_impl.h b/regex/nfa/lbr_common_impl.h new file mode 100644 index 000000000..5ae35431e --- /dev/null +++ b/regex/nfa/lbr_common_impl.h @@ -0,0 +1,462 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Large Bounded Repeat (LBR) engine: runtime impl X-macros. + */ + +#include "util/join.h" + +#define ENGINE_EXEC_NAME JOIN(nfaExecLbr, ENGINE_ROOT_NAME) +#define EXEC_FN JOIN(lbrExec, ENGINE_ROOT_NAME) +#define FWDSCAN_FN JOIN(lbrFwdScan, ENGINE_ROOT_NAME) +#define REVSCAN_FN JOIN(lbrRevScan, ENGINE_ROOT_NAME) + +char JOIN(ENGINE_EXEC_NAME, _queueCompressState)(const struct NFA *nfa, + const struct mq *q, s64a loc) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + DEBUG_PRINTF("entry, q->offset=%llu, loc=%lld\n", q->offset, loc); + + const struct lbr_common *l = getImplNfa(nfa); + const struct lbr_state *lstate = (const struct lbr_state *)q->state; + + u64a offset = q->offset + loc; + lbrCompressState(l, offset, lstate, q->streamState); + return 0; +} + +char JOIN(ENGINE_EXEC_NAME, _expandState)(const struct NFA *nfa, void *dest, + const void *src, u64a offset, + UNUSED u8 key) { + assert(nfa); + assert(isLbrType(nfa->type)); + DEBUG_PRINTF("entry, offset=%llu\n", offset); + + const struct lbr_common *l = getImplNfa(nfa); + struct lbr_state *lstate = (struct lbr_state *)dest; + lbrExpandState(l, offset, src, lstate); + return 0; +} + +char JOIN(ENGINE_EXEC_NAME, _reportCurrent)(const struct NFA *nfa, + struct mq *q) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + + const struct lbr_common *l = getImplNfa(nfa); + u64a offset = q_cur_offset(q); + DEBUG_PRINTF("firing match %u at %llu\n", l->report, offset); + q->cb(0, offset, l->report, q->context); + return 0; +} + +char JOIN(ENGINE_EXEC_NAME, _inAccept)(const struct NFA *nfa, + ReportID report, struct mq *q) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + DEBUG_PRINTF("entry\n"); + + const struct lbr_common *l = getImplNfa(nfa); + const struct RepeatInfo *info = getRepeatInfo(l); + const struct lbr_state *lstate = (const struct lbr_state *)q->state; + if (repeatIsDead(info, lstate)) { + DEBUG_PRINTF("repeat is dead\n"); + return 0; + } + + u64a offset = q->offset + q_last_loc(q); + return lbrInAccept(l, lstate, q->streamState, offset, report); +} + +char JOIN(ENGINE_EXEC_NAME, _inAnyAccept)(const struct NFA *nfa, struct mq *q) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + DEBUG_PRINTF("entry\n"); + + const struct lbr_common *l = getImplNfa(nfa); + return JOIN(ENGINE_EXEC_NAME, _inAccept)(nfa, l->report, q); +} + +char JOIN(ENGINE_EXEC_NAME, _queueInitState)(const struct NFA *nfa, + struct mq *q) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + DEBUG_PRINTF("entry\n"); + + const struct lbr_common *l = getImplNfa(nfa); + const struct RepeatInfo *info = getRepeatInfo(l); + + assert(q->state); + struct lbr_state *lstate = (struct lbr_state *)q->state; + assert(ISALIGNED(lstate)); + + 
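/*
 * Instantiation note (informal): this header is included once per
 * character-class variant with ENGINE_ROOT_NAME predefined (see the
 * #define/#include pairs at the end of lbr.c), so the JOIN() token
 * pasting expands, for ENGINE_ROOT_NAME == Verm, to functions such as
 *
 *     nfaExecLbrVerm_queueCompressState(), nfaExecLbrVerm_inAccept(), ...
 *
 * with lbrFwdScanVerm()/lbrRevScanVerm() plugged in as
 * FWDSCAN_FN/REVSCAN_FN.  The per-variant structs in lbr_internal.h
 * (lbr_verm, lbr_shuf, ...) only describe the escape set; everything
 * else is shared through lbr_common.
 */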
lstate->lastEscape = 0; + clearRepeat(info, lstate); + + return 0; +} + +char JOIN(ENGINE_EXEC_NAME, _initCompressedState)(const struct NFA *nfa, + u64a offset, + void *state, UNUSED u8 key) { + assert(nfa && state); + assert(isLbrType(nfa->type)); + DEBUG_PRINTF("entry\n"); + + const struct lbr_common *l = getImplNfa(nfa); + const struct RepeatInfo *info = getRepeatInfo(l); + struct lbr_state lstate; // temp control block on stack. + clearRepeat(info, &lstate); + lbrTop(l, &lstate, state, offset); + lbrCompressState(l, offset, &lstate, state); + + return 1; // LBR is alive +} + +// FIXME: this function could be much simpler for a Dot LBR, as all it needs to +// do is find the next top. +static really_inline +char JOIN(ENGINE_EXEC_NAME, _TopScan)(const struct NFA *nfa, struct mq *q, + s64a end) { + const struct lbr_common *l = getImplNfa(nfa); + const struct RepeatInfo *info = getRepeatInfo(l); + + const u64a offset = q->offset; + struct lbr_state *lstate = (struct lbr_state *)q->state; + assert(ISALIGNED(lstate)); + + assert(repeatIsDead(info, lstate)); + assert(q->cur < q->end); + + DEBUG_PRINTF("entry, end=%lld, offset=%llu, lastEscape=%llu\n", end, + offset, lstate->lastEscape); + + while (1) { + // Find the next top with location >= the last escape we saw. + for (; q->cur < q->end && q_cur_loc(q) <= end; q->cur++) { + u32 event = q_cur_type(q); + if ((event == MQE_TOP || event == MQE_TOP_FIRST) && + q_cur_offset(q) >= lstate->lastEscape) { + goto found_top; + } + DEBUG_PRINTF("skip event type=%u offset=%lld\n", event, q_cur_offset(q)); + } + + // No more tops, we're done. + break; + +found_top:; + assert(q->cur < q->end); + + u64a sp = q_cur_offset(q); + u64a first_match = sp + info->repeatMin; + DEBUG_PRINTF("first possible match is at %llu\n", first_match); + + u64a ep = MIN(MIN(end, (s64a)q->length) + offset, first_match); + if (ep > sp && sp >= offset) { + size_t eloc; + DEBUG_PRINTF("rev b%llu e%llu/%zu\n", sp - offset, ep - offset, + q->length); + assert(ep - offset <= q->length); + if (REVSCAN_FN(nfa, q->buffer, sp - offset, ep - offset, &eloc)) { + DEBUG_PRINTF("escape found at %llu\n", offset + eloc); + lstate->lastEscape = eloc; + q->cur++; + continue; + } + } + + lbrTop(l, lstate, q->streamState, sp); + return 1; + } + + DEBUG_PRINTF("exhausted queue\n"); + return 0; +} + +static really_inline +char JOIN(ENGINE_EXEC_NAME, _Q_i)(const struct NFA *nfa, struct mq *q, + s64a end, enum MatchMode mode) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + + const struct lbr_common *l = getImplNfa(nfa); + const struct RepeatInfo *info = getRepeatInfo(l); + + struct lbr_state *lstate = (struct lbr_state *)q->state; + assert(ISALIGNED(lstate)); + + + if (q->report_current) { + DEBUG_PRINTF("report_current: fire match at %llu\n", q_cur_offset(q)); + int rv = q->cb(0, q_cur_offset(q), l->report, q->context); + q->report_current = 0; + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + if (q->cur == q->end) { + return 1; + } + + assert(q->cur + 1 < q->end); /* require at least two items */ + assert(q_cur_type(q) == MQE_START); + u64a sp = q_cur_offset(q); + q->cur++; + DEBUG_PRINTF("sp=%llu, abs_end=%llu\n", sp, end + q->offset); + + while (q->cur < q->end) { + DEBUG_PRINTF("q item type=%d offset=%llu\n", q_cur_type(q), + q_cur_offset(q)); + + assert(sp >= q->offset); // not in history + + if (repeatIsDead(info, lstate)) { + DEBUG_PRINTF("repeat is currently dead, skipping scan\n"); + goto scan_done; + } + + u64a ep = q_cur_offset(q); + ep = MIN(ep, q->offset + 
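/*
 * Informal walkthrough of _TopScan above: while the repeat is dead, the
 * queue is searched for the next MQE_TOP whose offset is not older than
 * the last recorded escape; before accepting it, the region between the
 * top and its earliest possible match (top + repeatMin) is checked for
 * an escape with REVSCAN_FN, since a top that is killed before repeatMin
 * can never report and is cheaper to discard here than to track.  Only a
 * surviving top calls lbrTop() and revives the repeat.
 */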
end); + if (sp < ep) { + size_t eloc = 0; + char escape_found = 0; + DEBUG_PRINTF("scanning from sp=%llu to ep=%llu\n", sp, ep); + assert(sp >= q->offset && ep >= q->offset); + if (FWDSCAN_FN(nfa, q->buffer, sp - q->offset, ep - q->offset, &eloc)) { + escape_found = 1; + ep = q->offset + eloc; + DEBUG_PRINTF("escape found at %llu\n", ep); + assert(ep >= sp); + } + + assert(sp <= ep); + + if (mode == STOP_AT_MATCH) { + size_t mloc; + if (lbrFindMatch(l, sp, ep, lstate, q->streamState, &mloc)) { + DEBUG_PRINTF("storing match at %llu\n", sp + mloc); + q->cur--; + assert(q->cur < MAX_MQE_LEN); + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = (s64a)(sp - q->offset) + mloc; + return MO_MATCHES_PENDING; + } + } else { + assert(mode == CALLBACK_OUTPUT); + char rv = lbrMatchLoop(l, sp, ep, lstate, q->streamState, q->cb, + q->context); + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + assert(rv == MO_CONTINUE_MATCHING); + } + + if (escape_found) { + DEBUG_PRINTF("clearing repeat due to escape\n"); + clearRepeat(info, lstate); + } + } + + scan_done: + if (q_cur_loc(q) > end) { + q->cur--; + assert(q->cur < MAX_MQE_LEN); + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + return MO_ALIVE; + } + + if (repeatIsDead(info, lstate)) { + if (!JOIN(ENGINE_EXEC_NAME, _TopScan)(nfa, q, end)) { + assert(repeatIsDead(info, lstate)); + if (q->cur < q->end && q_cur_loc(q) > end) { + q->cur--; + assert(q->cur < MAX_MQE_LEN); + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + return MO_ALIVE; + } + return 0; + } + DEBUG_PRINTF("cur offset = %llu\n", q_cur_offset(q)); + } else { + switch (q_cur_type(q)) { + case MQE_TOP: + case MQE_TOP_FIRST: + lbrTop(l, lstate, q->streamState, q_cur_offset(q)); + break; + case MQE_START: + case MQE_END: + break; + default: + DEBUG_PRINTF("unhandled event %d!\n", q_cur_type(q)); + assert(0); + break; + } + } + + sp = q_cur_offset(q); + q->cur++; + } + + return lbrIsAlive(l, lstate, q->streamState, sp); +} + +char JOIN(ENGINE_EXEC_NAME, _Q)(const struct NFA *nfa, struct mq *q, s64a end) { + DEBUG_PRINTF("entry, offset=%llu, end=%lld\n", q->offset, end); + return JOIN(ENGINE_EXEC_NAME, _Q_i)(nfa, q, end, CALLBACK_OUTPUT); +} + +char JOIN(ENGINE_EXEC_NAME, _Q2)(const struct NFA *nfa, struct mq *q, s64a end) { + DEBUG_PRINTF("entry, offset=%llu, end=%lld\n", q->offset, end); + return JOIN(ENGINE_EXEC_NAME, _Q_i)(nfa, q, end, STOP_AT_MATCH); +} + +static really_inline +void JOIN(ENGINE_EXEC_NAME, _StreamSilent)(const struct NFA *nfa, struct mq *q, + const u8 *buf, size_t length) { + const struct lbr_common *l = getImplNfa(nfa); + const struct RepeatInfo *info = getRepeatInfo(l); + struct lbr_state *lstate = (struct lbr_state *)q->state; + assert(ISALIGNED(lstate)); + + assert(!repeatIsDead(info, lstate)); + + // This call doesn't produce matches, so we elide the lbrMatchLoop call + // entirely and just do escape scans to maintain the repeat. + + size_t eloc = 0; + char escaped = FWDSCAN_FN(nfa, buf, 0, length, &eloc); + if (escaped) { + assert(eloc < length); + DEBUG_PRINTF("escape found at %zu, clearing repeat\n", eloc); + clearRepeat(info, lstate); + } +} + +// Rose infix path. 
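/*
 * Informal note: _StreamSilent above is the "no reports" fast path - the
 * only state change that matters when matches are not being delivered is
 * whether an escape killed the repeat, so a single forward escape scan
 * suffices.  The _QR entry point below (used when the LBR sits as a Rose
 * infix) relies on it to catch up through history and buffer data, and
 * then simply answers whether the repeat is in, or can still reach, an
 * accept for the requested report.
 */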
+char JOIN(ENGINE_EXEC_NAME, _QR)(const struct NFA *nfa, struct mq *q, + ReportID report) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + + if (q->cur == q->end) { + return 1; + } + + assert(q->cur + 1 < q->end); /* require at least two items */ + assert(q_cur_type(q) == MQE_START); + u64a sp = q_cur_offset(q); + q->cur++; + DEBUG_PRINTF("sp=%llu\n", sp); + + const struct lbr_common *l = getImplNfa(nfa); + const struct RepeatInfo *info = getRepeatInfo(l); + struct lbr_state *lstate = (struct lbr_state *)q->state; + assert(ISALIGNED(lstate)); + const s64a lastLoc = q_last_loc(q); + + while (q->cur < q->end) { + DEBUG_PRINTF("q item type=%d offset=%llu\n", q_cur_type(q), + q_cur_offset(q)); + + if (repeatIsDead(info, lstate)) { + DEBUG_PRINTF("repeat is dead\n"); + goto scan_done; + } + + u64a ep = q_cur_offset(q); + + if (sp < q->offset) { + DEBUG_PRINTF("HISTORY BUFFER SCAN\n"); + assert(q->offset - sp <= q->hlength); + u64a local_ep = MIN(q->offset, ep); + const u8 *ptr = q->history + q->hlength + sp - q->offset; + JOIN(ENGINE_EXEC_NAME, _StreamSilent)(nfa, q, ptr, local_ep - sp); + sp = local_ep; + } + + if (repeatIsDead(info, lstate)) { + DEBUG_PRINTF("repeat is dead\n"); + goto scan_done; + } + + if (sp < ep) { + DEBUG_PRINTF("MAIN BUFFER SCAN\n"); + assert(ep - q->offset <= q->length); + const u8 *ptr = q->buffer + sp - q->offset; + JOIN(ENGINE_EXEC_NAME, _StreamSilent)(nfa, q, ptr, ep - sp); + } + + if (repeatIsDead(info, lstate)) { +scan_done: + if (!JOIN(ENGINE_EXEC_NAME, _TopScan)(nfa, q, lastLoc)) { + assert(repeatIsDead(info, lstate)); + assert(q->cur == q->end); + return 0; + } + } else { + switch (q_cur_type(q)) { + case MQE_TOP: + case MQE_TOP_FIRST: + lbrTop(l, lstate, q->streamState, q_cur_offset(q)); + break; + case MQE_START: + case MQE_END: + break; + default: + DEBUG_PRINTF("unhandled event %d!\n", q_cur_type(q)); + assert(0); + break; + } + } + + sp = q_cur_offset(q); + q->cur++; + } + + if (repeatIsDead(info, lstate)) { + DEBUG_PRINTF("repeat is dead\n"); + return 0; + } + + if (lbrInAccept(l, lstate, q->streamState, sp, report)) { + return MO_MATCHES_PENDING; + } + + return lbrIsActive(l, lstate, q->streamState, sp); +} + +#undef ENGINE_EXEC_NAME +#undef EXEC_FN +#undef FWDSCAN_FN +#undef REVSCAN_FN +#undef ENGINE_ROOT_NAME diff --git a/regex/nfa/lbr_internal.h b/regex/nfa/lbr_internal.h new file mode 100644 index 000000000..8ba11dd4d --- /dev/null +++ b/regex/nfa/lbr_internal.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Large Bounded Repeat (LBR): data structures. + */ + +#ifndef LBR_INTERNAL_H +#define LBR_INTERNAL_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "repeat_internal.h" + +/** \brief Common LBR header. */ +struct lbr_common { + u32 repeatInfoOffset; //!< offset of RepeatInfo structure relative + // to the start of lbr_common + ReportID report; //!< report to raise on match +}; + +struct lbr_dot { + struct lbr_common common; +}; + +struct lbr_verm { + struct lbr_common common; + char c; //!< escape char +}; + +struct lbr_shuf { + struct lbr_common common; + m128 mask_lo; //!< shufti lo mask for escape chars + m128 mask_hi; //!< shufti hi mask for escape chars +}; + +struct lbr_truf { + struct lbr_common common; + m128 mask1; + m128 mask2; +}; + +/** \brief Uncompressed ("full") state structure used by the LBR. This is + * stored in scratch, not in stream state. */ +struct lbr_state { + u64a lastEscape; //!< \brief offset of last escape seen. + union RepeatControl ctrl; //!< \brief repeat control block. */ +}; + +#ifdef __cplusplus +} +#endif + +#endif // LBR_INTERNAL_H diff --git a/regex/nfa/limex.h b/regex/nfa/limex.h new file mode 100644 index 000000000..0223604da --- /dev/null +++ b/regex/nfa/limex.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef LIMEX_H +#define LIMEX_H + +#ifdef __cplusplus +#include +extern "C" +{ +#endif + +#include "nfa_api.h" + +#if defined(DUMP_SUPPORT) && defined(__cplusplus) +#define GENERATE_NFA_DUMP_DECL(gf_name) \ + } /* extern "C" */ \ + namespace ue2 { \ + void gf_name##_dump(const struct NFA *nfa, const std::string &base); \ + } /* namespace ue2 */ \ + extern "C" { + +#else +#define GENERATE_NFA_DUMP_DECL(gf_name) +#endif + +#define GENERATE_NFA_DECL(gf_name) \ + char gf_name##_testEOD(const struct NFA *nfa, const char *state, \ + const char *streamState, u64a offset, \ + NfaCallback callback, void *context); \ + char gf_name##_Q(const struct NFA *n, struct mq *q, s64a end); \ + char gf_name##_Q2(const struct NFA *n, struct mq *q, s64a end); \ + char gf_name##_QR(const struct NFA *n, struct mq *q, ReportID report); \ + char gf_name##_reportCurrent(const struct NFA *n, struct mq *q); \ + char gf_name##_inAccept(const struct NFA *n, ReportID report, \ + struct mq *q); \ + char gf_name##_inAnyAccept(const struct NFA *n, struct mq *q); \ + char gf_name##_queueInitState(const struct NFA *n, struct mq *q); \ + char gf_name##_initCompressedState(const struct NFA *n, u64a offset, \ + void *state, u8 key); \ + char gf_name##_B_Reverse(const struct NFA *n, u64a offset, const u8 *buf, \ + size_t buflen, const u8 *hbuf, size_t hlen, \ + NfaCallback cb, void *context); \ + char gf_name##_queueCompressState(const struct NFA *nfa, \ + const struct mq *q, s64a loc); \ + char gf_name##_expandState(const struct NFA *nfa, void *dest, \ + const void *src, u64a offset, u8 key); \ + enum nfa_zombie_status gf_name##_zombie_status(const struct NFA *nfa, \ + struct mq *q, s64a loc); \ + GENERATE_NFA_DUMP_DECL(gf_name) + +GENERATE_NFA_DECL(nfaExecLimEx32) +GENERATE_NFA_DECL(nfaExecLimEx64) +GENERATE_NFA_DECL(nfaExecLimEx128) +GENERATE_NFA_DECL(nfaExecLimEx256) +GENERATE_NFA_DECL(nfaExecLimEx384) +GENERATE_NFA_DECL(nfaExecLimEx512) + +#undef GENERATE_NFA_DECL +#undef GENERATE_NFA_DUMP_DECL + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/nfa/limex_64.c b/regex/nfa/limex_64.c new file mode 100644 index 000000000..e8f0880b2 --- /dev/null +++ b/regex/nfa/limex_64.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief LimEx NFA: 128-bit SIMD runtime implementations. + */ + +/* Limex64 is unusual on as on 32 bit platforms, at runtime it uses an m128 for + * state calculations. + */ + +//#define DEBUG_INPUT +//#define DEBUG_EXCEPTIONS + +#include "limex.h" + +#include "accel.h" +#include "limex_internal.h" +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +// Common code +#define STATE_ON_STACK +#define ESTATE_ON_STACK + +#include "limex_runtime.h" + +#define SIZE 64 +#define ENG_STATE_T u64a + +#ifdef ARCH_64_BIT +#define STATE_T u64a +#define LOAD_FROM_ENG load_u64a +#else +#define STATE_T m128 +#define LOAD_FROM_ENG load_m128_from_u64a +#endif + +#include "limex_exceptional.h" + +#include "limex_state_impl.h" + +#define INLINE_ATTR really_inline +#include "limex_common_impl.h" + +#include "limex_runtime_impl.h" diff --git a/regex/nfa/limex_accel.c b/regex/nfa/limex_accel.c new file mode 100644 index 000000000..4834b6a54 --- /dev/null +++ b/regex/nfa/limex_accel.c @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Limex NFA: acceleration runtime. 
+ */ + +#include "limex_accel.h" + +#include "accel.h" +#include "limex_internal.h" +#include "limex_limits.h" +#include "limex_shuffle.h" +#include "nfa_internal.h" +#include "shufti.h" +#include "truffle.h" +#include "ue2common.h" +#include "vermicelli.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +static really_inline +size_t accelScanWrapper(const u8 *accelTable, const union AccelAux *aux, + const u8 *input, u32 idx, size_t i, size_t end) { + assert(accelTable); + assert(aux); + + DEBUG_PRINTF("shuffle returned %u -> aux %u\n", idx, accelTable[idx]); + assert(idx < (1 << NFA_MAX_ACCEL_STATES)); + if (!idx) { + return end; + } + + u8 aux_idx = accelTable[idx]; + if (!aux_idx) { + assert(aux[0].accel_type == ACCEL_NONE); + DEBUG_PRINTF("no accel, bailing\n"); + return i; + } + + aux = aux + aux_idx; + const u8 *ptr = run_accel(aux, &input[i], &input[end]); + assert(ptr >= &input[i]); + size_t j = (size_t)(ptr - input); + DEBUG_PRINTF("accel skipped %zu of %zu chars\n", (j - i), (end - i)); + DEBUG_PRINTF("returning j=%zu (i=%zu, end=%zu)\n", j, i, end); + return j; +} + +size_t doAccel32(u32 s, u32 accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end) { + u32 idx = pext32(s, accel); + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} + +#ifdef ARCH_64_BIT +size_t doAccel64(u64a s, u64a accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end) { + u32 idx = pext64(s, accel); + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} +#else +size_t doAccel64(m128 s, m128 accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end) { + u32 idx = pext64(movq(s), movq(accel)); + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} +#endif + +size_t doAccel128(const m128 *state, const struct LimExNFA128 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end) { + u32 idx; + m128 s = *state; + DEBUG_PRINTF("using PSHUFB for 128-bit shuffle\n"); + m128 accelPerm = limex->accelPermute; + m128 accelComp = limex->accelCompare; + idx = packedExtract128(s, accelPerm, accelComp); + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} + +size_t doAccel256(const m256 *state, const struct LimExNFA256 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end) { + u32 idx; + m256 s = *state; + DEBUG_PRINTF("using PSHUFB for 256-bit shuffle\n"); + m256 accelPerm = limex->accelPermute; + m256 accelComp = limex->accelCompare; +#if !defined(HAVE_AVX2) + u32 idx1 = packedExtract128(s.lo, accelPerm.lo, accelComp.lo); + u32 idx2 = packedExtract128(s.hi, accelPerm.hi, accelComp.hi); + assert((idx1 & idx2) == 0); // should be no shared bits + idx = idx1 | idx2; +#else + idx = packedExtract256(s, accelPerm, accelComp); +#endif + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} + +size_t doAccel384(const m384 *state, const struct LimExNFA384 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end) { + u32 idx; + m384 s = *state; + DEBUG_PRINTF("using PSHUFB for 384-bit shuffle\n"); + m384 accelPerm = limex->accelPermute; + m384 accelComp = limex->accelCompare; + u32 idx1 = packedExtract128(s.lo, accelPerm.lo, accelComp.lo); + u32 idx2 = packedExtract128(s.mid, accelPerm.mid, accelComp.mid); + u32 idx3 = packedExtract128(s.hi, accelPerm.hi, accelComp.hi); + assert((idx1 
& idx2 & idx3) == 0); // should be no shared bits + idx = idx1 | idx2 | idx3; + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} + +size_t doAccel512(const m512 *state, const struct LimExNFA512 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end) { + u32 idx; + m512 s = *state; + DEBUG_PRINTF("using PSHUFB for 512-bit shuffle\n"); + m512 accelPerm = limex->accelPermute; + m512 accelComp = limex->accelCompare; +#if defined(HAVE_AVX512) + idx = packedExtract512(s, accelPerm, accelComp); +#elif defined(HAVE_AVX2) + u32 idx1 = packedExtract256(s.lo, accelPerm.lo, accelComp.lo); + u32 idx2 = packedExtract256(s.hi, accelPerm.hi, accelComp.hi); + assert((idx1 & idx2) == 0); // should be no shared bits + idx = idx1 | idx2; +#else + u32 idx1 = packedExtract128(s.lo.lo, accelPerm.lo.lo, accelComp.lo.lo); + u32 idx2 = packedExtract128(s.lo.hi, accelPerm.lo.hi, accelComp.lo.hi); + u32 idx3 = packedExtract128(s.hi.lo, accelPerm.hi.lo, accelComp.hi.lo); + u32 idx4 = packedExtract128(s.hi.hi, accelPerm.hi.hi, accelComp.hi.hi); + assert((idx1 & idx2 & idx3 & idx4) == 0); // should be no shared bits + idx = idx1 | idx2 | idx3 | idx4; +#endif + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} diff --git a/regex/nfa/limex_accel.h b/regex/nfa/limex_accel.h new file mode 100644 index 000000000..e5c94e82a --- /dev/null +++ b/regex/nfa/limex_accel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Limex NFA: acceleration runtime. + * + * For the SIMD types (128 bits and above), we pass a pointer to the + * implementation NFA structure instead of three masks: otherwise we spend all + * our time building stack frames. 
+ */ + +#ifndef LIMEX_ACCEL_H +#define LIMEX_ACCEL_H + +#include "util/simd_utils.h" // for m128 etc + +union AccelAux; +struct LimExNFA64; +struct LimExNFA128; +struct LimExNFA256; +struct LimExNFA384; +struct LimExNFA512; + +size_t doAccel32(u32 s, u32 accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end); + +#ifdef ARCH_64_BIT +size_t doAccel64(u64a s, u64a accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end); +#else +size_t doAccel64(m128 s, m128 accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end); +#endif + +size_t doAccel128(const m128 *s, const struct LimExNFA128 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end); + +size_t doAccel256(const m256 *s, const struct LimExNFA256 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end); + +size_t doAccel384(const m384 *s, const struct LimExNFA384 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end); + +size_t doAccel512(const m512 *s, const struct LimExNFA512 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end); + +#endif diff --git a/regex/nfa/limex_common_impl.h b/regex/nfa/limex_common_impl.h new file mode 100644 index 000000000..e441945d7 --- /dev/null +++ b/regex/nfa/limex_common_impl.h @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "repeat.h" +#include "util/join.h" + +/* impl of limex functions which depend only on state size */ + +#if !defined(SIZE) || !defined(STATE_T) || !defined(LOAD_FROM_ENG) \ + || !defined(INLINE_ATTR) +# error Must define SIZE, STATE_T, LOAD_FROM_ENG and INLINE_ATTR in includer. 
+#endif + +#define IMPL_NFA_T JOIN(struct LimExNFA, SIZE) + +#define TESTEOD_FN JOIN(moNfaTestEod, SIZE) +#define LIMEX_INACCEPT_FN JOIN(limexInAccept, SIZE) +#define LIMEX_INANYACCEPT_FN JOIN(limexInAnyAccept, SIZE) +#define EXPIRE_ESTATE_FN JOIN(limexExpireExtendedState, SIZE) +#define REPORTCURRENT_FN JOIN(moNfaReportCurrent, SIZE) +#define INITIAL_FN JOIN(moNfaInitial, SIZE) +#define TOP_FN JOIN(moNfaTop, SIZE) +#define TOPN_FN JOIN(moNfaTopN, SIZE) +#define PROCESS_ACCEPTS_IMPL_FN JOIN(moProcessAcceptsImpl, SIZE) +#define PROCESS_ACCEPTS_FN JOIN(moProcessAccepts, SIZE) +#define PROCESS_ACCEPTS_NOSQUASH_FN JOIN(moProcessAcceptsNoSquash, SIZE) +#define CONTEXT_T JOIN(NFAContext, SIZE) +#define ONES_STATE JOIN(ones_, STATE_T) +#define AND_STATE JOIN(and_, STATE_T) +#define OR_STATE JOIN(or_, STATE_T) +#define ANDNOT_STATE JOIN(andnot_, STATE_T) +#define CLEARBIT_STATE JOIN(clearbit_, STATE_T) +#define TESTBIT_STATE JOIN(testbit_, STATE_T) +#define ISNONZERO_STATE JOIN(isNonZero_, STATE_T) +#define ISZERO_STATE JOIN(isZero_, STATE_T) +#define SQUASH_UNTUG_BR_FN JOIN(lazyTug, SIZE) +#define GET_NFA_REPEAT_INFO_FN JOIN(getNfaRepeatInfo, SIZE) + +#if defined(ARCH_64_BIT) && (SIZE >= 64) +#define CHUNK_T u64a +#define FIND_AND_CLEAR_FN findAndClearLSB_64 +#define POPCOUNT_FN popcount64 +#define RANK_IN_MASK_FN rank_in_mask64 +#else +#define CHUNK_T u32 +#define FIND_AND_CLEAR_FN findAndClearLSB_32 +#define POPCOUNT_FN popcount32 +#define RANK_IN_MASK_FN rank_in_mask32 +#endif + +#define NUM_STATE_CHUNKS (sizeof(STATE_T) / sizeof(CHUNK_T)) + +static really_inline +void SQUASH_UNTUG_BR_FN(const IMPL_NFA_T *limex, + const union RepeatControl *repeat_ctrl, + const char *repeat_state, u64a offset, + STATE_T *accstate) { + // switch off cyclic tug-accepts which aren't tuggable right now. + + /* TODO: might be nice to work which br to examine based on accstate rather + * than iterating overall br */ + + if (!limex->repeatCount) { + return; + } + + assert(repeat_ctrl); + assert(repeat_state); + + for (u32 i = 0; i < limex->repeatCount; i++) { + const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); + + u32 cyclicState = info->cyclicState; + if (!TESTBIT_STATE(*accstate, cyclicState)) { + continue; + } + + DEBUG_PRINTF("repeat %u (cyclic state %u) is active\n", i, cyclicState); + DEBUG_PRINTF("checking if offset %llu would match\n", offset); + + const union RepeatControl *ctrl = repeat_ctrl + i; + const char *state = repeat_state + info->stateOffset; + const struct RepeatInfo *repeat = getRepeatInfo(info); + if (repeatHasMatch(repeat, ctrl, state, offset) != REPEAT_MATCH) { + DEBUG_PRINTF("not ready to accept yet\n"); + CLEARBIT_STATE(accstate, cyclicState); + } + } +} + +static really_inline +char PROCESS_ACCEPTS_IMPL_FN(const IMPL_NFA_T *limex, const STATE_T *s, + STATE_T *squash, const STATE_T *acceptMask, + const struct NFAAccept *acceptTable, u64a offset, + NfaCallback callback, void *context) { + assert(s); + assert(limex); + assert(callback); + + const STATE_T accept_mask = *acceptMask; + STATE_T accepts = AND_STATE(*s, accept_mask); + + // Caller must ensure that we have at least one accept state on. + assert(ISNONZERO_STATE(accepts)); + + CHUNK_T chunks[NUM_STATE_CHUNKS]; + memcpy(chunks, &accepts, sizeof(accepts)); + + CHUNK_T mask_chunks[NUM_STATE_CHUNKS]; + memcpy(mask_chunks, &accept_mask, sizeof(accept_mask)); + + u32 base_index = 0; // Cumulative sum of mask popcount up to current chunk. 
+ for (u32 i = 0; i < NUM_STATE_CHUNKS; i++) { + CHUNK_T chunk = chunks[i]; + while (chunk != 0) { + u32 bit = FIND_AND_CLEAR_FN(&chunk); + u32 local_idx = RANK_IN_MASK_FN(mask_chunks[i], bit); + u32 idx = local_idx + base_index; + const struct NFAAccept *a = &acceptTable[idx]; + DEBUG_PRINTF("state %u: firing report list=%u, offset=%llu\n", + bit + i * (u32)sizeof(chunk) * 8, a->reports, offset); + int rv = limexRunAccept((const char *)limex, a, callback, context, + offset); + if (unlikely(rv == MO_HALT_MATCHING)) { + return 1; + } + if (squash != NULL && a->squash != MO_INVALID_IDX) { + DEBUG_PRINTF("applying squash mask at offset %u\n", a->squash); + const ENG_STATE_T *sq = + (const ENG_STATE_T *)((const char *)limex + a->squash); + *squash = AND_STATE(*squash, LOAD_FROM_ENG(sq)); + } + } + base_index += POPCOUNT_FN(mask_chunks[i]); + } + + return 0; +} + +static never_inline +char PROCESS_ACCEPTS_FN(const IMPL_NFA_T *limex, STATE_T *s, + const STATE_T *acceptMask, + const struct NFAAccept *acceptTable, u64a offset, + NfaCallback callback, void *context) { + // We have squash masks we might have to apply after firing reports. + STATE_T squash = ONES_STATE; + return PROCESS_ACCEPTS_IMPL_FN(limex, s, &squash, acceptMask, acceptTable, + offset, callback, context); + + *s = AND_STATE(*s, squash); +} + +static never_inline +char PROCESS_ACCEPTS_NOSQUASH_FN(const IMPL_NFA_T *limex, const STATE_T *s, + const STATE_T *acceptMask, + const struct NFAAccept *acceptTable, + u64a offset, NfaCallback callback, + void *context) { + STATE_T *squash = NULL; + return PROCESS_ACCEPTS_IMPL_FN(limex, s, squash, acceptMask, acceptTable, + offset, callback, context); +} + +// Run EOD accepts. Note that repeat_ctrl and repeat_state may be NULL if this +// LimEx contains no repeat structures. +static really_inline +char TESTEOD_FN(const IMPL_NFA_T *limex, const STATE_T *s, + const union RepeatControl *repeat_ctrl, + const char *repeat_state, u64a offset, + NfaCallback callback, void *context) { + assert(limex && s); + + // There may not be any EOD accepts in this NFA. + if (!limex->acceptEodCount) { + return MO_CONTINUE_MATCHING; + } + + const STATE_T acceptEodMask = LOAD_FROM_ENG(&limex->acceptAtEOD); + STATE_T foundAccepts = AND_STATE(*s, acceptEodMask); + + SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, + offset + 1 /* EOD 'symbol' */, &foundAccepts); + + if (unlikely(ISNONZERO_STATE(foundAccepts))) { + const struct NFAAccept *acceptEodTable = getAcceptEodTable(limex); + if (PROCESS_ACCEPTS_NOSQUASH_FN(limex, &foundAccepts, &acceptEodMask, + acceptEodTable, offset, callback, + context)) { + return MO_HALT_MATCHING; + } + } + + return MO_CONTINUE_MATCHING; +} + +// Run accepts corresponding to current state. 
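PROCESS_ACCEPTS_IMPL_FN above (and LIMEX_INACCEPT_FN further down) maps each set accept bit to a dense accept-table slot with find-and-clear plus rank-in-mask, carrying base_index across chunks. Here is a minimal sketch of those two primitives for a single u32 chunk, using GCC/Clang builtins for brevity (the toy_* names are illustrative, not from this patch); REPORTCURRENT_FN, introduced by the comment above, follows and reuses the same machinery through PROCESS_ACCEPTS_NOSQUASH_FN.

#include <assert.h>
#include <stdint.h>

/* Pop the lowest set bit of *v and return its position
 * (cf. findAndClearLSB_32). */
static unsigned toy_find_and_clear_lsb(uint32_t *v) {
    assert(*v != 0);
    unsigned bit = (unsigned)__builtin_ctz(*v);
    *v &= *v - 1; /* clear exactly that bit */
    return bit;
}

/* Number of set bits in 'mask' strictly below 'bit' (cf. rank_in_mask32):
 * the dense index of that state within this chunk's accept entries. */
static unsigned toy_rank_in_mask(uint32_t mask, unsigned bit) {
    assert(mask & (UINT32_C(1) << bit));
    return (unsigned)__builtin_popcount(mask & ((UINT32_C(1) << bit) - 1));
}

For example, with a chunk accept mask of 0xb4 (states 2, 4, 5, 7) and accepts of 0x30, the loop pops bits 4 and 5 and reads acceptTable[base_index + 1] and acceptTable[base_index + 2]; base_index then grows by the popcount of the chunk's mask before the next chunk is handled.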
+static really_inline +char REPORTCURRENT_FN(const IMPL_NFA_T *limex, const struct mq *q) { + assert(limex && q); + assert(q->state); + assert(q_cur_type(q) == MQE_START); + + STATE_T s = *(STATE_T *)q->state; + STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); + STATE_T foundAccepts = AND_STATE(s, acceptMask); + + if (unlikely(ISNONZERO_STATE(foundAccepts))) { + DEBUG_PRINTF("found accepts\n"); + DEBUG_PRINTF("for nfa %p\n", limex); + const struct NFAAccept *acceptTable = getAcceptTable(limex); + u64a offset = q_cur_offset(q); + + if (PROCESS_ACCEPTS_NOSQUASH_FN(limex, &foundAccepts, &acceptMask, + acceptTable, offset, q->cb, + q->context)) { + return MO_HALT_MATCHING; + } + } + + return MO_CONTINUE_MATCHING; +} + +static really_inline +STATE_T INITIAL_FN(const IMPL_NFA_T *impl, char onlyDs) { + return LOAD_FROM_ENG(onlyDs ? &impl->initDS : &impl->init); +} + +static really_inline +STATE_T TOP_FN(const IMPL_NFA_T *impl, char onlyDs, STATE_T state) { + return OR_STATE(INITIAL_FN(impl, onlyDs), state); +} + +static really_inline +STATE_T TOPN_FN(const IMPL_NFA_T *limex, STATE_T state, u32 n) { + assert(n < limex->topCount); + const ENG_STATE_T *topsptr = + (const ENG_STATE_T *)((const char *)limex + limex->topOffset); + STATE_T top = LOAD_FROM_ENG(&topsptr[n]); + return OR_STATE(top, state); +} + +static really_inline +void EXPIRE_ESTATE_FN(const IMPL_NFA_T *limex, struct CONTEXT_T *ctx, + u64a offset) { + assert(limex); + assert(ctx); + + if (!limex->repeatCount) { + return; + } + + DEBUG_PRINTF("expire estate at offset %llu\n", offset); + + const STATE_T cyclics + = AND_STATE(ctx->s, LOAD_FROM_ENG(&limex->repeatCyclicMask)); + if (ISZERO_STATE(cyclics)) { + DEBUG_PRINTF("no cyclic states are on\n"); + return; + } + + for (u32 i = 0; i < limex->repeatCount; i++) { + const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); + + u32 cyclicState = info->cyclicState; + if (!TESTBIT_STATE(cyclics, cyclicState)) { + continue; + } + + DEBUG_PRINTF("repeat %u (cyclic state %u) is active\n", i, + cyclicState); + + const struct RepeatInfo *repeat = getRepeatInfo(info); + if (repeat->repeatMax == REPEAT_INF) { + continue; // can't expire + } + + const union RepeatControl *repeat_ctrl = ctx->repeat_ctrl + i; + const char *repeat_state = ctx->repeat_state + info->stateOffset; + u64a last_top = repeatLastTop(repeat, repeat_ctrl, repeat_state); + assert(repeat->repeatMax < REPEAT_INF); + DEBUG_PRINTF("offset %llu, last_top %llu repeatMax %u\n", offset, + last_top, repeat->repeatMax); + u64a adj = 0; + /* if the cycle's tugs are active at repeat max, it is still alive */ + if (TESTBIT_STATE(LOAD_FROM_ENG(&limex->accept), cyclicState) || + TESTBIT_STATE(LOAD_FROM_ENG(&limex->acceptAtEOD), cyclicState)) { + DEBUG_PRINTF("lazy tug possible - may still be inspected\n"); + adj = 1; + } else { + const ENG_STATE_T *tug_mask = + (const ENG_STATE_T *)((const char *)info + info->tugMaskOffset); + if (ISNONZERO_STATE(AND_STATE(ctx->s, LOAD_FROM_ENG(tug_mask)))) { + DEBUG_PRINTF("tug possible - may still be inspected\n"); + adj = 1; + } + } + + if (offset >= last_top + repeat->repeatMax + adj) { + DEBUG_PRINTF("repeat state is stale, squashing state %u\n", + cyclicState); + CLEARBIT_STATE(&ctx->s, cyclicState); + } + } +} + +// Specialised inAccept call: LimEx NFAs with the "lazy tug" optimisation (see +// UE-1636) need to guard cyclic tug-accepts as well. 
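EXPIRE_ESTATE_FN above squashes a cyclic repeat state once offset >= last_top + repeatMax + adj, with adj set to 1 only while a tug or accept might still inspect the repeat. A small worked sketch of that test under hypothetical numbers follows (the toy_ name is not from this patch); the inAccept guards that the comment above introduces apply the same lazy-tug caution via SQUASH_UNTUG_BR_FN.

#include <stdint.h>

/* Staleness test mirroring EXPIRE_ESTATE above, for a finite repeatMax only:
 * tug_adj is 1 while a tug or accept could still inspect the repeat. */
static int toy_repeat_is_stale(uint64_t offset, uint64_t last_top,
                               uint32_t repeat_max, uint32_t tug_adj) {
    return offset >= last_top + repeat_max + tug_adj;
}

With last_top = 100 and repeatMax = 5, the cyclic state is squashed from offset 105 onwards when nothing can tug it, but survives through offset 105 (tug_adj = 1) while a lazy tug-accept may still fire.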
+static really_inline +char LIMEX_INACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state, + union RepeatControl *repeat_ctrl, char *repeat_state, + u64a offset, ReportID report) { + assert(limex); + + const STATE_T accept_mask = LOAD_FROM_ENG(&limex->accept); + STATE_T accepts = AND_STATE(state, accept_mask); + + // Are we in an accept state? + if (ISZERO_STATE(accepts)) { + DEBUG_PRINTF("no accept states are on\n"); + return 0; + } + + SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, offset, &accepts); + + DEBUG_PRINTF("looking for report %u\n", report); + + const struct NFAAccept *acceptTable = getAcceptTable(limex); + + CHUNK_T chunks[NUM_STATE_CHUNKS]; + memcpy(chunks, &accepts, sizeof(accepts)); + + CHUNK_T mask_chunks[NUM_STATE_CHUNKS]; + memcpy(mask_chunks, &accept_mask, sizeof(accept_mask)); + + u32 base_index = 0; // Cumulative sum of mask popcount up to current chunk. + for (u32 i = 0; i < NUM_STATE_CHUNKS; i++) { + CHUNK_T chunk = chunks[i]; + while (chunk != 0) { + u32 bit = FIND_AND_CLEAR_FN(&chunk); + u32 local_idx = RANK_IN_MASK_FN(mask_chunks[i], bit); + u32 idx = local_idx + base_index; + assert(idx < limex->acceptCount); + const struct NFAAccept *a = &acceptTable[idx]; + DEBUG_PRINTF("state %u is on, report list at %u\n", + bit + i * (u32)sizeof(chunk) * 8, a->reports); + + if (limexAcceptHasReport((const char *)limex, a, report)) { + DEBUG_PRINTF("report %u is on\n", report); + return 1; + } + } + base_index += POPCOUNT_FN(mask_chunks[i]); + } + + return 0; +} + +static really_inline +char LIMEX_INANYACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state, + union RepeatControl *repeat_ctrl, char *repeat_state, + u64a offset) { + assert(limex); + + const STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); + STATE_T accstate = AND_STATE(state, acceptMask); + + // Are we in an accept state? + if (ISZERO_STATE(accstate)) { + DEBUG_PRINTF("no accept states are on\n"); + return 0; + } + + SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, offset, &accstate); + + return ISNONZERO_STATE(accstate); +} + +#undef TESTEOD_FN +#undef REPORTCURRENT_FN +#undef EXPIRE_ESTATE_FN +#undef LIMEX_INACCEPT_FN +#undef LIMEX_INANYACCEPT_FN +#undef INITIAL_FN +#undef TOP_FN +#undef TOPN_FN +#undef CONTEXT_T +#undef IMPL_NFA_T +#undef ONES_STATE +#undef AND_STATE +#undef OR_STATE +#undef ANDNOT_STATE +#undef CLEARBIT_STATE +#undef TESTBIT_STATE +#undef ISNONZERO_STATE +#undef ISZERO_STATE +#undef PROCESS_ACCEPTS_IMPL_FN +#undef PROCESS_ACCEPTS_FN +#undef PROCESS_ACCEPTS_NOSQUASH_FN +#undef SQUASH_UNTUG_BR_FN +#undef GET_NFA_REPEAT_INFO_FN + +#undef CHUNK_T +#undef FIND_AND_CLEAR_FN +#undef POPCOUNT_FN +#undef RANK_IN_MASK_FN +#undef NUM_STATE_CHUNKS diff --git a/regex/nfa/limex_context.h b/regex/nfa/limex_context.h new file mode 100644 index 000000000..60d208793 --- /dev/null +++ b/regex/nfa/limex_context.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime context structures (NFAContext128 and friends) for the NFA. + */ + +#ifndef LIMEX_CONTEXT_H +#define LIMEX_CONTEXT_H + +#include "ue2common.h" +#include "callback.h" +#include "util/simd_utils.h" // for m128 etc + +// Runtime context structures. + +/* Note: The size of the context structures may vary from platform to platform + * (notably, for the Limex64 structure). As a result, information based on the + * size and other detail of these structures should not be written into the + * bytecode -- really, the details of the structure should not be accessed by + * the ue2 compile side at all. + */ +#ifdef __cplusplus +#error ue2 runtime only file +#endif + +/* cached_estate/esucc etc... + * + * If the exception state matches the cached_estate we will apply + * the or in the cached_esucc to the successor states rather than processing + * the exceptions. + * + * If the current exception state is a superset of the cached_estate, the + * cache is NOT used at all. + * + * The cache is updated when we see a different cacheable estate. + */ + +#define GEN_CONTEXT_STRUCT(nsize, ntype) \ +struct ALIGN_CL_DIRECTIVE NFAContext##nsize { \ + ntype s; /**< state bitvector (on entry/exit) */ \ + ntype local_succ; /**< used by exception handling for large models */ \ + ntype cached_estate; /* inited to 0 */ \ + ntype cached_esucc; \ + char cached_br; /**< cached_estate contains a br state */ \ + const ReportID *cached_reports; \ + union RepeatControl *repeat_ctrl; \ + char *repeat_state; \ + NfaCallback callback; \ + void *context; \ +}; + +GEN_CONTEXT_STRUCT(32, u32) +#ifdef ARCH_64_BIT +GEN_CONTEXT_STRUCT(64, u64a) +#else +GEN_CONTEXT_STRUCT(64, m128) +#endif +GEN_CONTEXT_STRUCT(128, m128) +GEN_CONTEXT_STRUCT(256, m256) +GEN_CONTEXT_STRUCT(384, m384) +GEN_CONTEXT_STRUCT(512, m512) + +#undef GEN_CONTEXT_STRUCT + +#endif diff --git a/regex/nfa/limex_exceptional.h b/regex/nfa/limex_exceptional.h new file mode 100644 index 000000000..6c7335f1b --- /dev/null +++ b/regex/nfa/limex_exceptional.h @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief LimEx NFA: runtime exception processing code. + * + * X-macro generic impl, included into the various LimEx model implementations. + */ + +#if !defined(SIZE) || !defined(STATE_T) || !defined(LOAD_FROM_ENG) +# error Must define SIZE, STATE_T, LOAD_FROM_ENG in includer. +#endif + +#include "config.h" +#include "limex_ring.h" +#include "util/join.h" +#include "util/uniform_ops.h" + +#define PE_FN JOIN(processExceptional, SIZE) +#define RUN_EXCEPTION_FN JOIN(runException, SIZE) +#define ZERO_STATE JOIN(zero_, STATE_T) +#define AND_STATE JOIN(and_, STATE_T) +#define EQ_STATE(a, b) (!JOIN(noteq_, STATE_T)((a), (b))) +#define OR_STATE JOIN(or_, STATE_T) +#define EXPAND_STATE JOIN(expand_, STATE_T) +#define SHUFFLE_BYTE_STATE JOIN(shuffle_byte_, STATE_T) +#define TESTBIT_STATE JOIN(testbit_, STATE_T) +#define EXCEPTION_T JOIN(struct NFAException, SIZE) +#define CONTEXT_T JOIN(NFAContext, SIZE) +#define IMPL_NFA_T JOIN(LimExNFA, SIZE) +#define GET_NFA_REPEAT_INFO_FN JOIN(getNfaRepeatInfo, SIZE) + +#ifdef ESTATE_ON_STACK +#define ESTATE_ARG STATE_T estate +#else +#define ESTATE_ARG const STATE_T *estatep +#define estate (*estatep) +#endif + +#ifdef STATE_ON_STACK +#define STATE_ARG_NAME s +#define STATE_ARG STATE_T STATE_ARG_NAME +#define STATE_ARG_P &s +#else +#define STATE_ARG_NAME sp +#define STATE_ARG const STATE_T *STATE_ARG_NAME +#define STATE_ARG_P sp +#endif + +#ifndef STATE_ON_STACK +#define BIG_MODEL +#endif + +#ifdef ARCH_64_BIT +#define CHUNK_T u64a +#define FIND_AND_CLEAR_FN findAndClearLSB_64 +#define POPCOUNT_FN popcount64 +#define RANK_IN_MASK_FN rank_in_mask64 +#else +#define CHUNK_T u32 +#define FIND_AND_CLEAR_FN findAndClearLSB_32 +#define POPCOUNT_FN popcount32 +#define RANK_IN_MASK_FN rank_in_mask32 +#endif + +/** \brief Process a single exception. Returns 1 if exception handling should + * continue, 0 if an accept callback has instructed us to halt. 
*/ +static really_inline +int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, + STATE_T *succ, +#ifndef BIG_MODEL + STATE_T *local_succ, +#endif + const struct IMPL_NFA_T *limex, + u64a offset, + struct CONTEXT_T *ctx, + struct proto_cache *new_cache, + enum CacheResult *cacheable, + char in_rev, + const char flags) { + assert(e); + +#ifdef DEBUG_EXCEPTIONS + printf("EXCEPTION e=%p reports=%u trigger=", e, e->reports); + if (e->trigger == LIMEX_TRIGGER_NONE) { + printf("none"); + } else if (e->trigger == LIMEX_TRIGGER_POS) { + printf("pos"); + } else if (e->trigger == LIMEX_TRIGGER_TUG) { + printf("tug"); + } else { + printf("unknown!"); + } + printf("\n"); +#endif + + // Trigger exceptions, used in bounded repeats. + assert(!in_rev || e->trigger == LIMEX_TRIGGER_NONE); + if (!in_rev && e->trigger != LIMEX_TRIGGER_NONE) { + assert(e->repeatOffset != MO_INVALID_IDX); + const struct NFARepeatInfo *info = + (const struct NFARepeatInfo *)((const char *)limex + + e->repeatOffset); + const struct RepeatInfo *repeat = getRepeatInfo(info); + assert(ctx->repeat_ctrl && ctx->repeat_state); + union RepeatControl *repeat_ctrl = ctx->repeat_ctrl + info->ctrlIndex; + char *repeat_state = ctx->repeat_state + info->stateOffset; + + if (e->trigger == LIMEX_TRIGGER_POS) { + char cyclic_on = TESTBIT_STATE(*STATE_ARG_P, info->cyclicState); + processPosTrigger(repeat, repeat_ctrl, repeat_state, offset, + cyclic_on); + *cacheable = DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES; + } else { + assert(e->trigger == LIMEX_TRIGGER_TUG); + enum TriggerResult rv = + processTugTrigger(repeat, repeat_ctrl, repeat_state, offset); + if (rv == TRIGGER_FAIL) { + *cacheable = DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES; + DEBUG_PRINTF("tug found no valid matches in repeat state\n"); + return 1; // continue + } else if (rv == TRIGGER_STALE) { + *cacheable = DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES; + DEBUG_PRINTF("stale history, squashing cyclic state\n"); + assert(e->hasSquash == LIMEX_SQUASH_TUG); + *succ = AND_STATE(*succ, LOAD_FROM_ENG(&e->squash)); + return 1; // continue + } else if (rv == TRIGGER_SUCCESS_CACHE) { + new_cache->br = 1; + } else { + assert(rv == TRIGGER_SUCCESS); + *cacheable = DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES; + } + } + } + + // Some exceptions fire accepts. + if (e->reports != MO_INVALID_IDX) { + if (flags & CALLBACK_OUTPUT) { + const ReportID *reports = + (const ReportID *)((const char *)limex + e->reports); + if (unlikely(limexRunReports(reports, ctx->callback, + ctx->context, offset) + == MO_HALT_MATCHING)) { + DEBUG_PRINTF("callback instructed us to stop\n"); + return 0; // halt + } + if (*cacheable == CACHE_RESULT) { + if (!new_cache->reports || new_cache->reports == reports) { + new_cache->reports = reports; + } else { + *cacheable = DO_NOT_CACHE_RESULT; + } + } + } else { + if ((flags & FIRST_BYTE) && *cacheable == CACHE_RESULT) { + *cacheable = DO_NOT_CACHE_RESULT; + } /* otherwise we can cache as we never care about accepts */ + } + } + + // Most exceptions have a set of successors to switch on. `local_succ' is + // ORed into `succ' at the end of the caller's loop. +#ifndef BIG_MODEL + *local_succ = OR_STATE(*local_succ, LOAD_FROM_ENG(&e->successors)); +#else + ctx->local_succ = OR_STATE(ctx->local_succ, LOAD_FROM_ENG(&e->successors)); +#endif + + // Some exceptions squash states behind them. Note that we squash states in + // 'succ', not local_succ. 
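    /*
     * Worked example with hypothetical 8-bit masks: if succ is 0b00010110
     * (states 1, 2 and 4 on) and e->squash clears states 1 and 2
     * (0b11111001), the AND below leaves only state 4 on. Successors
     * collected in local_succ are ORed into succ only after the caller's
     * exception loop, so they cannot be squashed here.
     */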
+ if (e->hasSquash == LIMEX_SQUASH_CYCLIC + || e->hasSquash == LIMEX_SQUASH_REPORT) { + *succ = AND_STATE(*succ, LOAD_FROM_ENG(&e->squash)); + if (*cacheable == CACHE_RESULT) { + *cacheable = DO_NOT_CACHE_RESULT; + } + } + + return 1; // continue +} + +#ifndef RUN_EXCEPTION_FN_ONLY + +/** \brief Process all of the exceptions associated with the states in the \a + * estate. */ +static really_inline +int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ, + const struct IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, + u64a offset, struct CONTEXT_T *ctx, char in_rev, char flags) { + assert(diffmask > 0); // guaranteed by caller macro + + if (EQ_STATE(estate, ctx->cached_estate)) { + DEBUG_PRINTF("using cached succ from previous state\n"); + *succ = OR_STATE(*succ, ctx->cached_esucc); + if (ctx->cached_reports && (flags & CALLBACK_OUTPUT)) { + DEBUG_PRINTF("firing cached reports from previous state\n"); + if (unlikely(limexRunReports(ctx->cached_reports, ctx->callback, + ctx->context, offset) + == MO_HALT_MATCHING)) { + return PE_RV_HALT; // halt; + } + } + return 0; + } + +#ifndef BIG_MODEL + STATE_T local_succ = ZERO_STATE; +#else + ctx->local_succ = ZERO_STATE; +#endif + + struct proto_cache new_cache = {0, NULL}; + enum CacheResult cacheable = CACHE_RESULT; + +#if defined(HAVE_AVX512VBMI) && SIZE > 64 + if (likely(limex->flags & LIMEX_FLAG_EXTRACT_EXP)) { + m512 emask = EXPAND_STATE(*STATE_ARG_P); + emask = SHUFFLE_BYTE_STATE(load_m512(&limex->exceptionShufMask), emask); + emask = and512(emask, load_m512(&limex->exceptionAndMask)); + u64a word = eq512mask(emask, load_m512(&limex->exceptionBitMask)); + + do { + u32 bit = FIND_AND_CLEAR_FN(&word); + const EXCEPTION_T *e = &exceptions[bit]; + + if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, +#ifndef BIG_MODEL + &local_succ, +#endif + limex, offset, ctx, &new_cache, &cacheable, + in_rev, flags)) { + return PE_RV_HALT; + } + } while (word); + } else { + // A copy of the estate as an array of GPR-sized chunks. + CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; + CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; +#ifdef ESTATE_ON_STACK + memcpy(chunks, &estate, sizeof(STATE_T)); +#else + memcpy(chunks, estatep, sizeof(STATE_T)); +#endif + memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T)); + + u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)]; + base_index[0] = 0; + for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) { + base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]); + } + + do { + u32 t = findAndClearLSB_32(&diffmask); +#ifdef ARCH_64_BIT + t >>= 1; // Due to diffmask64, which leaves holes in the bitmask. +#endif + assert(t < ARRAY_LENGTH(chunks)); + CHUNK_T word = chunks[t]; + assert(word != 0); + do { + u32 bit = FIND_AND_CLEAR_FN(&word); + u32 local_index = RANK_IN_MASK_FN(emask_chunks[t], bit); + u32 idx = local_index + base_index[t]; + const EXCEPTION_T *e = &exceptions[idx]; + + if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, +#ifndef BIG_MODEL + &local_succ, +#endif + limex, offset, ctx, &new_cache, &cacheable, + in_rev, flags)) { + return PE_RV_HALT; + } + } while (word); + } while (diffmask); + } +#else + // A copy of the estate as an array of GPR-sized chunks. 
+ CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; + CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; +#ifdef ESTATE_ON_STACK + memcpy(chunks, &estate, sizeof(STATE_T)); +#else + memcpy(chunks, estatep, sizeof(STATE_T)); +#endif + memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T)); + + u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)]; + base_index[0] = 0; + for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) { + base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]); + } + + do { + u32 t = findAndClearLSB_32(&diffmask); +#ifdef ARCH_64_BIT + t >>= 1; // Due to diffmask64, which leaves holes in the bitmask. +#endif + assert(t < ARRAY_LENGTH(chunks)); + CHUNK_T word = chunks[t]; + assert(word != 0); + do { + u32 bit = FIND_AND_CLEAR_FN(&word); + u32 local_index = RANK_IN_MASK_FN(emask_chunks[t], bit); + u32 idx = local_index + base_index[t]; + const EXCEPTION_T *e = &exceptions[idx]; + + if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, +#ifndef BIG_MODEL + &local_succ, +#endif + limex, offset, ctx, &new_cache, &cacheable, + in_rev, flags)) { + return PE_RV_HALT; + } + } while (word); + } while (diffmask); +#endif + +#ifndef BIG_MODEL + *succ = OR_STATE(*succ, local_succ); +#else + *succ = OR_STATE(*succ, ctx->local_succ); +#endif + + if (cacheable == CACHE_RESULT) { + ctx->cached_estate = estate; +#ifndef BIG_MODEL + ctx->cached_esucc = local_succ; +#else + ctx->cached_esucc = ctx->local_succ; +#endif + ctx->cached_reports = new_cache.reports; + ctx->cached_br = new_cache.br; + } else if (cacheable == DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES) { + if (ctx->cached_br) { + ctx->cached_estate = ZERO_STATE; + } + } + + return 0; +} + +#endif + +#undef ZERO_STATE +#undef AND_STATE +#undef EQ_STATE +#undef OR_STATE +#undef EXPAND_STATE +#undef SHUFFLE_BYTE_STATE +#undef TESTBIT_STATE +#undef PE_FN +#undef RUN_EXCEPTION_FN +#undef CONTEXT_T +#undef EXCEPTION_T + +#ifdef estate +#undef estate +#endif + +#ifdef BIG_MODEL +#undef BIG_MODEL +#endif + +#undef STATE_ARG +#undef STATE_ARG_NAME +#undef STATE_ARG_P + +#undef IMPL_NFA_T + +#undef CHUNK_T +#undef FIND_AND_CLEAR_FN +#undef POPCOUNT_FN +#undef RANK_IN_MASK_FN diff --git a/regex/nfa/limex_internal.h b/regex/nfa/limex_internal.h new file mode 100644 index 000000000..23b1bd970 --- /dev/null +++ b/regex/nfa/limex_internal.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + This file provides the internal structures and definitions required for the + real NFAs (aka limex NFAs ); + + Limex NFAs now have variable length in memory. They look like this: + + LimExNFA structure + Fixed length, e.g. LimExNFA256. + Reachability table + Variable length array of state bitvectors, mapped into by + NFACommonXXX.reachMap. + Tops + Variable length array of state bitvectors, used for TOP_N events. + Acceleration structures + Variable length array of AccelAux structs. + Accepts + Variable length array of NFAAccept structs. + EOD Accepts + Variable length array of NFAAccept structs. + Exceptions + Variable length array of NFAExceptionXXX structs. + Repeat Structure Offsets + Array of u32 offsets that point at each "Repeat Structure" (below) + Repeat Structures + Variable length repeat structures, addressed via + NFAException32::repeatOffset etc. + + The state associated with the NFA is split into: + + -# The "traditional" NFA state as a bitvector. This is stored in the + first N bytes of the state space (length given in + NFACommonXXX.stateSize), and may be stored shrunk to CEIL(stateSize/8) + or compressed. If it is stored compressed, than the + LIMEX_FLAG_COMPRESS_STATE flag is set in NFACommonXXX.flags. + -# Extended NFA state, only used in some LimEx NFAs. This consists of a + variable length array of LimExNFAExtendedState structures, each with + pointers to a packed list of mmbit structures that follows them. Only + present when used. + + The value of NFA.stateSize gives the total state size in bytes (the sum of + all the above). + + Number of shifts should be always greater or equal to 1 + Number of shifts 0 means that no appropriate NFA engine was found. + +*/ + +#ifndef LIMEX_INTERNAL_H +#define LIMEX_INTERNAL_H + +#include "nfa_internal.h" +#include "repeat_internal.h" + +// Constants +#define MAX_SHIFT_COUNT 8 /**< largest number of shifts used by a LimEx NFA */ +#define MAX_SHIFT_AMOUNT 16 /**< largest shift amount used by a LimEx NFA */ + +#define LIMEX_FLAG_COMPRESS_STATE 1 /**< pack state into stream state */ +#define LIMEX_FLAG_COMPRESS_MASKED 2 /**< use reach mask-based compression */ +#define LIMEX_FLAG_CANNOT_DIE 4 /**< limex cannot have no states on */ +#define LIMEX_FLAG_EXTRACT_EXP 8 /**< use limex exception bit extraction */ + +enum LimExTrigger { + LIMEX_TRIGGER_NONE = 0, + LIMEX_TRIGGER_POS = 1, + LIMEX_TRIGGER_TUG = 2 +}; + +enum LimExSquash { + LIMEX_SQUASH_NONE = 0, //!< no squash for you! 
+ LIMEX_SQUASH_CYCLIC = 1, //!< squash due to cyclic state + LIMEX_SQUASH_TUG = 2, //!< squash due to tug trigger with stale estate + LIMEX_SQUASH_REPORT = 3 //!< squash when report is raised +}; + +/* uniform looking types for the macros */ +typedef u8 u_8; +typedef u16 u_16; +typedef u32 u_32; +typedef u64a u_64; +typedef m128 u_128; +typedef m256 u_256; +typedef m384 u_384; +typedef m512 u_512; + +#define CREATE_NFA_LIMEX(size) \ +struct NFAException##size { \ + u_##size squash; /**< mask of states to leave on */ \ + u_##size successors; /**< mask of states to switch on */ \ + u32 reports; /**< offset to start of reports list, or MO_INVALID_IDX */ \ + u32 repeatOffset; /**< offset to NFARepeatInfo, or MO_INVALID_IDX */ \ + u8 hasSquash; /**< from enum LimExSquash */ \ + u8 trigger; /**< from enum LimExTrigger */ \ +}; \ + \ +struct LimExNFA##size { \ + u8 reachMap[N_CHARS]; /**< map of char -> entry in reach[] */ \ + u32 reachSize; /**< number of reach masks */ \ + u32 accelCount; /**< number of entries in accel table */ \ + u32 accelTableOffset; /* rel. to start of LimExNFA */ \ + u32 accelAuxCount; /**< number of entries in aux table */ \ + u32 accelAuxOffset; /* rel. to start of LimExNFA */ \ + u32 acceptCount; \ + u32 acceptOffset; /* rel. to start of LimExNFA */ \ + u32 acceptEodCount; \ + u32 acceptEodOffset; /* rel. to start of LimExNFA */ \ + u32 exceptionCount; \ + u32 exceptionOffset; /* rel. to start of LimExNFA */ \ + u32 repeatCount; \ + u32 repeatOffset; \ + u32 squashOffset; /* rel. to start of LimExNFA; for accept squashing */ \ + u32 squashCount; \ + u32 topCount; \ + u32 topOffset; /* rel. to start of LimExNFA */ \ + u32 stateSize; /**< not including extended history */ \ + u32 flags; \ + u_##size init; \ + u_##size initDS; \ + u_##size accept; /**< mask of accept states */ \ + u_##size acceptAtEOD; /**< mask of states that accept at EOD */ \ + u_##size accel; /**< mask of accelerable states */ \ + u_##size accelPermute; /**< pshufb permute mask (not GPR) */ \ + u_##size accelCompare; /**< pshufb compare mask (not GPR) */ \ + u_##size accel_and_friends; /**< mask of accelerable states + likely + * followers */ \ + u_##size compressMask; /**< switch off before compress */ \ + u_##size exceptionMask; \ + u_##size repeatCyclicMask; /**< also includes tug states */ \ + u_##size zombieMask; /**< zombie if in any of the set states */ \ + u_##size shift[MAX_SHIFT_COUNT]; \ + u32 shiftCount; /**< number of shift masks used */ \ + u8 shiftAmount[MAX_SHIFT_COUNT]; /**< shift amount for each mask */ \ + m512 exceptionShufMask; /**< exception byte shuffle mask */ \ + m512 exceptionBitMask; /**< exception bit mask */ \ + m512 exceptionAndMask; /**< exception and mask */ \ +}; + +CREATE_NFA_LIMEX(32) +CREATE_NFA_LIMEX(64) +CREATE_NFA_LIMEX(128) +CREATE_NFA_LIMEX(256) +CREATE_NFA_LIMEX(384) +CREATE_NFA_LIMEX(512) + +/** \brief Structure describing a bounded repeat within the LimEx NFA. + * + * This struct is followed in memory by: + * + * -# a RepeatInfo structure + * -# a variable-sized lookup table for REPEAT_SPARSE_OPTIMAL_P repeats + * -# a TUG mask + */ +struct NFARepeatInfo { + u32 cyclicState; //!< index of this repeat's cyclic state + u32 ctrlIndex; //!< index of this repeat's control block + u32 packedCtrlOffset; //!< offset to packed control block in stream state + u32 stateOffset; //!< offset to repeat state in stream state + u32 stateSize; //!< total size of packed stream state for this repeat + u32 tugMaskOffset; //!< offset to tug mask (rel. 
to NFARepeatInfo) +}; + +struct NFAAccept { + u8 single_report; //!< If true, 'reports' is report id. + + /** + * \brief If single report is true, this is the report id to fire. + * Otherwise, it is the offset (relative to the start of the LimExNFA + * structure) of a list of reports, terminated with MO_INVALID_IDX. + */ + u32 reports; + + u32 squash; //!< Offset (from LimEx) into squash masks, or MO_INVALID_IDX. +}; + +#endif diff --git a/regex/nfa/limex_limits.h b/regex/nfa/limex_limits.h new file mode 100644 index 000000000..f4df54a4b --- /dev/null +++ b/regex/nfa/limex_limits.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LIMEX_LIMITS_H +#define LIMEX_LIMITS_H + +#define NFA_MAX_STATES 512 /**< max states in an NFA */ +#define NFA_MAX_ACCEL_STATES 8 /**< max accel states in a NFA */ + +#endif diff --git a/regex/nfa/limex_native.c b/regex/nfa/limex_native.c new file mode 100644 index 000000000..f6f5809c3 --- /dev/null +++ b/regex/nfa/limex_native.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief LimEx NFA: native GPR runtime implementations. + */ + +//#define DEBUG +//#define DEBUG_INPUT +//#define DEBUG_EXCEPTIONS + +#include "limex.h" + +#include "accel.h" +#include "limex_internal.h" +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/bitutils.h" + +// Common code +#define STATE_ON_STACK +#define ESTATE_ON_STACK + +#include "limex_runtime.h" + +// Other implementation code from X-Macro impl. +#define SIZE 32 +#define STATE_T u32 +#define ENG_STATE_T u32 +#define LOAD_FROM_ENG load_u32 + +#include "limex_state_impl.h" + +#define INLINE_ATTR really_inline +#include "limex_common_impl.h" + +//////////////////////////////////////////////////////////////////////////// +// LimEx NFA implementation code - general purpose registers +//////////////////////////////////////////////////////////////////////////// + +// Process exceptional states + +#define STATE_ON_STACK +#define ESTATE_ON_STACK +#define RUN_EXCEPTION_FN_ONLY +#include "limex_exceptional.h" + +static really_inline +int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, + const struct LimExNFA32 *limex, + const struct NFAException32 *exceptions, u64a offset, + struct NFAContext32 *ctx, char in_rev, char flags) { + assert(estate != 0); // guaranteed by calling macro + + if (estate == ctx->cached_estate) { + DEBUG_PRINTF("using cached succ from previous state\n"); + *succ |= ctx->cached_esucc; + if (ctx->cached_reports && (flags & CALLBACK_OUTPUT)) { + DEBUG_PRINTF("firing cached reports from previous state\n"); + if (unlikely(limexRunReports(ctx->cached_reports, ctx->callback, + ctx->context, offset) + == MO_HALT_MATCHING)) { + return PE_RV_HALT; // halt; + } + } + return 0; + } + + u32 orig_estate = estate; // for caching + u32 local_succ = 0; + struct proto_cache new_cache = {0, NULL}; + enum CacheResult cacheable = CACHE_RESULT; + + /* Note that only exception-states that consist of exceptions that _only_ + * set successors (not fire accepts or squash states) are cacheable. */ + + do { + u32 bit = findAndClearLSB_32(&estate); + u32 idx = rank_in_mask32(limex->exceptionMask, bit); + const struct NFAException32 *e = &exceptions[idx]; + if (!runException32(e, s, succ, &local_succ, limex, offset, ctx, + &new_cache, &cacheable, in_rev, flags)) { + return PE_RV_HALT; + } + } while (estate != 0); + + *succ |= local_succ; + + if (cacheable == CACHE_RESULT) { + ctx->cached_estate = orig_estate; + ctx->cached_esucc = local_succ; + ctx->cached_reports = new_cache.reports; + ctx->cached_br = new_cache.br; + } else if (cacheable == DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES) { + if (ctx->cached_br) { + ctx->cached_estate = 0U; + } + } + + return 0; +} + +// 32-bit models. 
+#include "limex_runtime_impl.h" diff --git a/regex/nfa/limex_ring.h b/regex/nfa/limex_ring.h new file mode 100644 index 000000000..522cfa12b --- /dev/null +++ b/regex/nfa/limex_ring.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Bounded Repeat implementation for the LimEx NFA. + */ + +#ifndef LIMEX_RING_H +#define LIMEX_RING_H + +#include "ue2common.h" +#include "repeat.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** \brief Return values from \ref processTugTrigger, used to provide feedback + * about a bounded repeat to the caller. + * + * TRIGGER_FAIL does not get cached as we prefer to use TRIGGER_STALE which + * allows the exception to squash the cyclic state as well. */ +enum TriggerResult { + TRIGGER_FAIL, /**< no valid matches, but history still valid */ + TRIGGER_SUCCESS, /**< valid match found */ + TRIGGER_STALE, /**< no valid matches and history is invalid (stale) */ + TRIGGER_SUCCESS_CACHE /**< valid match found; can cache as the repeat has no + upper bound. */ +}; + +/** \brief Handle a TUG trigger: given an \p offset, returns whether a repeat + * matches or not. */ +static really_inline +enum TriggerResult processTugTrigger(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const char *state, u64a offset) { + DEBUG_PRINTF("tug trigger, %s history, repeat={%u,%u}, offset=%llu, " + "ctrl=%p, state=%p\n", + repeatTypeName(info->type), info->repeatMin, info->repeatMax, + offset, ctrl, state); + + assert(ISALIGNED(ctrl)); + + enum RepeatMatch rv = repeatHasMatch(info, ctrl, state, offset); + switch (rv) { + case REPEAT_NOMATCH: + return TRIGGER_FAIL; + case REPEAT_STALE: + return TRIGGER_STALE; + case REPEAT_MATCH: + if (info->repeatMax == REPEAT_INF) { + // {N,} repeats can be cached. + return TRIGGER_SUCCESS_CACHE; + } else { + return TRIGGER_SUCCESS; + } + } + + assert(0); // unreachable + return TRIGGER_FAIL; +} + +/** \brief Handle a POS trigger: stores a top in the repeat. 
*/ +static really_inline +void processPosTrigger(const struct RepeatInfo *info, union RepeatControl *ctrl, + char *state, u64a offset, char is_alive) { + DEBUG_PRINTF("pos trigger, %s history, repeat={%u,%u}, offset=%llu, " + "is_alive=%d\n", repeatTypeName(info->type), + info->repeatMin, info->repeatMax, offset, is_alive); + + assert(ISALIGNED(ctrl)); + + repeatStore(info, ctrl, state, offset, is_alive); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/nfa/limex_runtime.h b/regex/nfa/limex_runtime.h new file mode 100644 index 000000000..6109d382d --- /dev/null +++ b/regex/nfa/limex_runtime.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + \brief Limex Execution Engine Or: + How I Learned To Stop Worrying And Love The Preprocessor + + This file includes utility functions which do not depend on the size of the + state or shift masks directly. +*/ + +#ifndef LIMEX_RUNTIME_H +#define LIMEX_RUNTIME_H + +#include "limex_accel.h" +#include "limex_context.h" +#include "limex_internal.h" +#include "nfa_api_util.h" +#include "nfa_internal.h" +#include "util/uniform_ops.h" + +//////////////////////////////////////////////////////////////////////////// +// LimEx NFA implementation code - common macros +//////////////////////////////////////////////////////////////////////////// + +#ifdef DEBUG_INPUT +#include +#define DUMP_INPUT(index) DEBUG_PRINTF("input %p i=%zu: %02hhx (%c)\n", \ + &input[index], index, input[index], \ + isprint(input[index]) ? 
input[index] : ' ') +#else +#define DUMP_INPUT(index) do { } while(0) +#endif + +#define NO_OUTPUT 0 +#define CALLBACK_OUTPUT 1 +#define FIRST_BYTE 16 + +enum CacheResult { + DO_NOT_CACHE_RESULT, + CACHE_RESULT, + DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES +}; + +struct proto_cache { + char br; + const ReportID *reports; +}; + +#define PE_RV_HALT 1 + +#ifdef STATE_ON_STACK +#define pass_state s +#else +#define pass_state &s +#endif + +#ifdef ESTATE_ON_STACK +#define pass_estate estate +#else +#define pass_estate &estate +#endif + +static really_inline +int limexRunReports(const ReportID *reports, NfaCallback callback, + void *context, u64a offset) { + assert(reports); + assert(callback); + + for (; *reports != MO_INVALID_IDX; ++reports) { + DEBUG_PRINTF("firing report for id %u at offset %llu\n", + *reports, offset); + int rv = callback(0, offset, *reports, context); + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + return MO_CONTINUE_MATCHING; // continue +} + +static really_inline +int limexRunAccept(const char *limex_base, const struct NFAAccept *accept, + NfaCallback callback, void *context, u64a offset) { + if (accept->single_report) { + const ReportID report = accept->reports; + DEBUG_PRINTF("firing single report for id %u at offset %llu\n", report, + offset); + return callback(0, offset, report, context); + } + const ReportID *reports = (const ReportID *)(limex_base + accept->reports); + return limexRunReports(reports, callback, context, offset); +} + +static really_inline +int limexAcceptHasReport(const char *limex_base, const struct NFAAccept *accept, + ReportID report) { + if (accept->single_report) { + return accept->reports == report; + } + + const ReportID *reports = (const ReportID *)(limex_base + accept->reports); + assert(*reports != MO_INVALID_IDX); + do { + if (*reports == report) { + return 1; + } + reports++; + } while (*reports != MO_INVALID_IDX); + + return 0; +} + +/** \brief Return a (correctly typed) pointer to the exception table. */ +#define getExceptionTable(exc_type, lim) \ + ((const exc_type *)((const char *)(lim) + (lim)->exceptionOffset)) + +/** \brief Return a pointer to the ordinary accepts table. */ +#define getAcceptTable(lim) \ + ((const struct NFAAccept *)((const char *)(lim) + (lim)->acceptOffset)) + +/** \brief Return a pointer to the EOD accepts table. 
*/ +#define getAcceptEodTable(lim) \ + ((const struct NFAAccept *)((const char *)(lim) + (lim)->acceptEodOffset)) + +#define MAKE_GET_NFA_REPEAT_INFO(size) \ + static really_inline const struct NFARepeatInfo *getNfaRepeatInfo##size( \ + const struct LimExNFA##size *limex, unsigned num) { \ + assert(num < limex->repeatCount); \ + \ + const char *base = (const char *)limex; \ + const u32 *repeatOffset = (const u32 *)(base + limex->repeatOffset); \ + assert(ISALIGNED(repeatOffset)); \ + \ + const struct NFARepeatInfo *info = \ + (const struct NFARepeatInfo *)(base + repeatOffset[num]); \ + assert(ISALIGNED(info)); \ + return info; \ + } + +MAKE_GET_NFA_REPEAT_INFO(32) +MAKE_GET_NFA_REPEAT_INFO(64) +MAKE_GET_NFA_REPEAT_INFO(128) +MAKE_GET_NFA_REPEAT_INFO(256) +MAKE_GET_NFA_REPEAT_INFO(384) +MAKE_GET_NFA_REPEAT_INFO(512) + +static really_inline +const struct RepeatInfo *getRepeatInfo(const struct NFARepeatInfo *info) { + const struct RepeatInfo *repeat = + (const struct RepeatInfo *)((const char *)info + sizeof(*info)); + assert(ISALIGNED(repeat)); + return repeat; +} + +static really_inline +union RepeatControl *getRepeatControlBase(char *state, size_t nfa_state_size) { + union RepeatControl *ctrl_base = + (union RepeatControl *)(state + + ROUNDUP_N(nfa_state_size, + alignof(union RepeatControl))); + assert(ISALIGNED(ctrl_base)); + return ctrl_base; +} + +static really_inline +const union RepeatControl *getRepeatControlBaseConst(const char *state, + size_t nfa_state_size) { + const union RepeatControl *ctrl_base = + (const union RepeatControl *)(state + + ROUNDUP_N(nfa_state_size, + alignof(union RepeatControl))); + assert(ISALIGNED(ctrl_base)); + return ctrl_base; +} + +#endif diff --git a/regex/nfa/limex_runtime_impl.h b/regex/nfa/limex_runtime_impl.h new file mode 100644 index 000000000..3b3bc5013 --- /dev/null +++ b/regex/nfa/limex_runtime_impl.h @@ -0,0 +1,1079 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "util/join.h" +#ifndef __KERNEL__ +#include +#else +#include +#endif + +/** \file + * \brief Limex Execution Engine Or: + * How I Learned To Stop Worrying And Love The Preprocessor + * + * Version 2.0: now with X-Macros, so you get line numbers in your debugger. + */ + + +#if !defined(SIZE) || !defined(STATE_T) || !defined(LOAD_FROM_ENG) +# error Must define SIZE, STATE_T, LOAD_FROM_ENG in includer. +#endif + +#define LIMEX_API_ROOT JOIN(nfaExecLimEx, SIZE) + +#define IMPL_NFA_T JOIN(struct LimExNFA, SIZE) + +#define TESTEOD_FN JOIN(moNfaTestEod, SIZE) +#define INITIAL_FN JOIN(moNfaInitial, SIZE) +#define TOP_FN JOIN(moNfaTop, SIZE) +#define TOPN_FN JOIN(moNfaTopN, SIZE) +#define REPORTCURRENT_FN JOIN(moNfaReportCurrent, SIZE) +#define COMPRESS_FN JOIN(moNfaCompressState, SIZE) +#define EXPAND_FN JOIN(moNfaExpandState, SIZE) +#define COMPRESS_REPEATS_FN JOIN(LIMEX_API_ROOT, _Compress_Repeats) +#define EXPAND_REPEATS_FN JOIN(LIMEX_API_ROOT, _Expand_Repeats) +#define PROCESS_ACCEPTS_FN JOIN(moProcessAccepts, SIZE) +#define PROCESS_ACCEPTS_NOSQUASH_FN JOIN(moProcessAcceptsNoSquash, SIZE) +#define GET_NFA_REPEAT_INFO_FN JOIN(getNfaRepeatInfo, SIZE) +#define RUN_ACCEL_FN JOIN(LIMEX_API_ROOT, _Run_Accel) +#define RUN_EXCEPTIONS_FN JOIN(LIMEX_API_ROOT, _Run_Exceptions) +#define REV_STREAM_FN JOIN(LIMEX_API_ROOT, _Rev_Stream) +#define LOOP_NOACCEL_FN JOIN(LIMEX_API_ROOT, _Loop_No_Accel) +#define STREAM_FN JOIN(LIMEX_API_ROOT, _Stream) +#define STREAMCB_FN JOIN(LIMEX_API_ROOT, _Stream_CB) +#define STREAMFIRST_FN JOIN(LIMEX_API_ROOT, _Stream_First) +#define STREAMSILENT_FN JOIN(LIMEX_API_ROOT, _Stream_Silent) +#define CONTEXT_T JOIN(NFAContext, SIZE) +#define EXCEPTION_T JOIN(struct NFAException, SIZE) +#define AND_STATE JOIN(and_, STATE_T) +#define ANDNOT_STATE JOIN(andnot_, STATE_T) +#define OR_STATE JOIN(or_, STATE_T) +#define LSHIFT_STATE JOIN(lshift_, STATE_T) +#define TESTBIT_STATE JOIN(testbit_, STATE_T) +#define CLEARBIT_STATE JOIN(clearbit_, STATE_T) +#define ZERO_STATE JOIN(zero_, STATE_T) +#define ISNONZERO_STATE JOIN(isNonZero_, STATE_T) +#define ISZERO_STATE JOIN(isZero_, STATE_T) +#define NOTEQ_STATE JOIN(noteq_, STATE_T) + +// Pick an appropriate diffrich function for this platform. +#ifdef ARCH_64_BIT +#define DIFFRICH_STATE JOIN(diffrich64_, STATE_T) +#else +#define DIFFRICH_STATE JOIN(diffrich_, STATE_T) +#endif + +#define EXPIRE_ESTATE_FN JOIN(limexExpireExtendedState, SIZE) +#define SQUASH_UNTUG_BR_FN JOIN(lazyTug, SIZE) + +// Acceleration and exception masks: we load them on the fly for really big +// models. +#if SIZE < 256 +#define ACCEL_MASK accelMask +#define ACCEL_AND_FRIENDS_MASK accel_and_friendsMask +#define EXCEPTION_MASK exceptionMask +#else +#define ACCEL_MASK LOAD_FROM_ENG(&limex->accel) +#define ACCEL_AND_FRIENDS_MASK LOAD_FROM_ENG(&limex->accel_and_friends) +#define EXCEPTION_MASK LOAD_FROM_ENG(&limex->exceptionMask) +#endif + +// Run exception processing, if necessary. Returns 0 if scanning should +// continue, 1 if an accept was fired and the user instructed us to halt. +static really_inline +char RUN_EXCEPTIONS_FN(const IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, + STATE_T s, const STATE_T emask, size_t i, u64a offset, + STATE_T *succ, u64a *final_loc, struct CONTEXT_T *ctx, + const char flags, const char in_rev, + const char first_match) { + STATE_T estate = AND_STATE(s, emask); + u32 diffmask = DIFFRICH_STATE(ZERO_STATE, estate); + if (likely(!diffmask)) { + return 0; // No exceptions to process. 
+ } + + if (first_match && i) { + STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); + STATE_T foundAccepts = AND_STATE(s, acceptMask); + if (unlikely(ISNONZERO_STATE(foundAccepts))) { + DEBUG_PRINTF("first match at %zu\n", i); + DEBUG_PRINTF("for nfa %p\n", limex); + assert(final_loc); + ctx->s = s; + *final_loc = i; + return 1; // Halt matching. + } + } + + u64a callback_offset = i + offset; + char localflags = (!i && !in_rev) ? NO_OUTPUT | FIRST_BYTE : flags; + + int rv = JOIN(processExceptional, SIZE)( + pass_state, pass_estate, diffmask, succ, limex, exceptions, + callback_offset, ctx, in_rev, localflags); + if (rv == PE_RV_HALT) { + return 1; // Halt matching. + } + + return 0; +} + +static really_inline +size_t RUN_ACCEL_FN(const STATE_T s, UNUSED const STATE_T accelMask, + UNUSED const IMPL_NFA_T *limex, const u8 *accelTable, + const union AccelAux *accelAux, const u8 *input, size_t i, + size_t length) { + size_t j; +#if SIZE < 128 + // For small cases, we pass the state by value. + j = JOIN(doAccel, SIZE)(s, accelMask, accelTable, accelAux, input, i, + length); +#else + j = JOIN(doAccel, SIZE)(&s, limex, accelTable, accelAux, input, i, length); +#endif + + assert(j >= i); + assert(i <= length); + return j; +} + +// Shift macros for Limited NFAs. Defined in terms of uniform ops. +// LimExNFAxxx ptr in 'limex' and the current state in 's' +#define NFA_EXEC_LIM_SHIFT(limex_m, curr_m, shift_idx) \ + LSHIFT_STATE(AND_STATE(curr_m, LOAD_FROM_ENG(&limex_m->shift[shift_idx])), \ + limex_m->shiftAmount[shift_idx]) + +// Calculate the (limited model) successors for a number of variable shifts. +// Assumes current state in 'curr_m' and places the successors in 'succ_m'. +#define NFA_EXEC_GET_LIM_SUCC(limex_m, curr_m, succ_m) \ + do { \ + succ_m = NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 0); \ + switch (limex_m->shiftCount) { \ + case 8: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 7)); \ + FALLTHROUGH; \ + case 7: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 6)); \ + FALLTHROUGH; \ + case 6: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 5)); \ + FALLTHROUGH; \ + case 5: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 4)); \ + FALLTHROUGH; \ + case 4: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 3)); \ + FALLTHROUGH; \ + case 3: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 2)); \ + FALLTHROUGH; \ + case 2: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 1)); \ + FALLTHROUGH; \ + case 1: \ + FALLTHROUGH; \ + case 0: \ + ; \ + } \ + } while (0) + +/** + * \brief LimEx NFAS inner loop without accel. + * + * Note that the "all zeroes" early death check is only performed if can_die is + * true. 
+ * + */ +static really_inline +char LOOP_NOACCEL_FN(const IMPL_NFA_T *limex, const u8 *input, size_t *loc, + size_t length, STATE_T *s_ptr, struct CONTEXT_T *ctx, + u64a offset, const char flags, u64a *final_loc, + const char first_match, const char can_die) { + const ENG_STATE_T *reach = get_reach_table(limex); +#if SIZE < 256 + const STATE_T exceptionMask = LOAD_FROM_ENG(&limex->exceptionMask); +#endif + const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); + STATE_T s = *s_ptr; + + size_t i = *loc; + for (; i != length; i++) { + DUMP_INPUT(i); + if (can_die && ISZERO_STATE(s)) { + DEBUG_PRINTF("no states are switched on, early exit\n"); + break; + } + + STATE_T succ; + NFA_EXEC_GET_LIM_SUCC(limex, s, succ); + + if (RUN_EXCEPTIONS_FN(limex, exceptions, s, EXCEPTION_MASK, i, offset, + &succ, final_loc, ctx, flags, 0, first_match)) { + return MO_HALT_MATCHING; + } + + u8 c = input[i]; + s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]])); + } + + *loc = i; + *s_ptr = s; + return MO_CONTINUE_MATCHING; +} + +static really_inline +char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, + struct CONTEXT_T *ctx, u64a offset, const char flags, + u64a *final_loc, const char first_match) { + const ENG_STATE_T *reach = get_reach_table(limex); +#if SIZE < 256 + const STATE_T accelMask = LOAD_FROM_ENG(&limex->accel); + const STATE_T accel_and_friendsMask + = LOAD_FROM_ENG(&limex->accel_and_friends); + const STATE_T exceptionMask = LOAD_FROM_ENG(&limex->exceptionMask); +#endif + const u8 *accelTable = + (const u8 *)((const char *)limex + limex->accelTableOffset); + const union AccelAux *accelAux = + (const union AccelAux *)((const char *)limex + limex->accelAuxOffset); + const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); + STATE_T s = ctx->s; + + /* assert(ISALIGNED_16(exceptions)); */ + /* assert(ISALIGNED_16(reach)); */ + + size_t i = 0; + size_t min_accel_offset = 0; + if (!limex->accelCount || length < ACCEL_MIN_LEN) { + min_accel_offset = length; + goto without_accel; + } else { + goto with_accel; + } + +without_accel: + if (limex->flags & LIMEX_FLAG_CANNOT_DIE) { + const char can_die = 0; + if (LOOP_NOACCEL_FN(limex, input, &i, min_accel_offset, &s, ctx, offset, + flags, final_loc, first_match, + can_die) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + const char can_die = 1; + if (LOOP_NOACCEL_FN(limex, input, &i, min_accel_offset, &s, ctx, offset, + flags, final_loc, first_match, + can_die) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + +with_accel: + for (; i != length; i++) { + DUMP_INPUT(i); + if (i + 16 <= length && + ISZERO_STATE(ANDNOT_STATE(ACCEL_AND_FRIENDS_MASK, s))) { + DEBUG_PRINTF("current states are all accelerable\n"); + assert(i + 16 <= length); + size_t post_idx = + RUN_ACCEL_FN(s, ACCEL_MASK, limex, accelTable, accelAux, input, + i, length); + if (post_idx != i) { + /* squashing any friends as they may no longer be valid; + * offset back off should ensure they weren't doing anything + * important */ + s = AND_STATE(ACCEL_MASK, s); + } + + if (i && post_idx < min_accel_offset + BAD_ACCEL_DIST) { + min_accel_offset = post_idx + BIG_ACCEL_PENALTY; + } else { + min_accel_offset = post_idx + SMALL_ACCEL_PENALTY; + } + + if (min_accel_offset >= length - ACCEL_MIN_LEN) { + min_accel_offset = length; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + post_idx - i, min_accel_offset - post_idx, + length - post_idx); + + i = post_idx; + if (i == length) { + break; /* all 
chars eaten, break out of loop */ + } + goto without_accel; + } + + STATE_T succ; + NFA_EXEC_GET_LIM_SUCC(limex, s, succ); + + if (RUN_EXCEPTIONS_FN(limex, exceptions, s, EXCEPTION_MASK, i, offset, + &succ, final_loc, ctx, flags, 0, first_match)) { + return MO_HALT_MATCHING; + } + + u8 c = input[i]; + s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]])); + } + + ctx->s = s; + + if ((first_match || (flags & CALLBACK_OUTPUT)) && limex->acceptCount) { + STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); + const struct NFAAccept *acceptTable = getAcceptTable(limex); + STATE_T foundAccepts = AND_STATE(s, acceptMask); + if (unlikely(ISNONZERO_STATE(foundAccepts))) { + if (first_match) { + ctx->s = s; + assert(final_loc); + *final_loc = length; + return MO_HALT_MATCHING; + } else if (PROCESS_ACCEPTS_FN(limex, &ctx->s, &acceptMask, + acceptTable, offset + length, + ctx->callback, ctx->context)) { + return MO_HALT_MATCHING; + } + } + } + if (first_match) { + assert(final_loc); + *final_loc = length; + } + return MO_CONTINUE_MATCHING; +} + +static never_inline +char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, + struct CONTEXT_T *ctx, u64a offset) { + const ENG_STATE_T *reach = get_reach_table(limex); +#if SIZE < 256 + const STATE_T exceptionMask = LOAD_FROM_ENG(&limex->exceptionMask); +#endif + const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); + STATE_T s = ctx->s; + + /* assert(ISALIGNED_16(exceptions)); */ + /* assert(ISALIGNED_16(reach)); */ + const char flags = CALLBACK_OUTPUT; + u64a *final_loc = NULL; + + for (size_t i = length; i != 0; i--) { + DUMP_INPUT(i - 1); + if (ISZERO_STATE(s)) { + DEBUG_PRINTF("no states are switched on, early exit\n"); + ctx->s = s; + return MO_CONTINUE_MATCHING; + } + + STATE_T succ; + NFA_EXEC_GET_LIM_SUCC(limex, s, succ); + + if (RUN_EXCEPTIONS_FN(limex, exceptions, s, EXCEPTION_MASK, i, offset, + &succ, final_loc, ctx, flags, 1, 0)) { + return MO_HALT_MATCHING; + } + + u8 c = input[i - 1]; + s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]])); + } + + ctx->s = s; + + STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); + const struct NFAAccept *acceptTable = getAcceptTable(limex); + const u32 acceptCount = limex->acceptCount; + assert(flags & CALLBACK_OUTPUT); + if (acceptCount) { + STATE_T foundAccepts = AND_STATE(s, acceptMask); + if (unlikely(ISNONZERO_STATE(foundAccepts))) { + if (PROCESS_ACCEPTS_NOSQUASH_FN(limex, &ctx->s, &acceptMask, + acceptTable, offset, ctx->callback, + ctx->context)) { + return MO_HALT_MATCHING; + } + } + } + return MO_CONTINUE_MATCHING; +} + +static really_inline +void COMPRESS_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, void *src, + u64a offset) { + if (!limex->repeatCount) { + return; + } + + STATE_T s = *(STATE_T *)src; + + if (ISZERO_STATE(AND_STATE(LOAD_FROM_ENG(&limex->repeatCyclicMask), s))) { + DEBUG_PRINTF("no cyclics are on\n"); + return; + } + + const union RepeatControl *ctrl = + getRepeatControlBaseConst((const char *)src, sizeof(STATE_T)); + char *state_base = (char *)dest + limex->stateSize; + + for (u32 i = 0; i < limex->repeatCount; i++) { + DEBUG_PRINTF("repeat %u\n", i); + const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); + + const ENG_STATE_T *tug_mask = + (const ENG_STATE_T *)((const char *)info + info->tugMaskOffset); + /* repeat may still be inspected if its tug state is on */ + if (!TESTBIT_STATE(s, info->cyclicState) + && ISZERO_STATE(AND_STATE(s, LOAD_FROM_ENG(tug_mask)))) { + DEBUG_PRINTF("is dead\n"); + continue; + 
} + + const struct RepeatInfo *repeat = getRepeatInfo(info); + DEBUG_PRINTF("packing state (packedCtrlOffset=%u)\n", + info->packedCtrlOffset); + repeatPack(state_base + info->packedCtrlOffset, repeat, &ctrl[i], + offset); + } + + *(STATE_T *)src = s; +} + +char JOIN(LIMEX_API_ROOT, _queueCompressState)(const struct NFA *n, + const struct mq *q, s64a loc) { + void *dest = q->streamState; + void *src = q->state; + u8 key = queue_prev_byte(q, loc); + const IMPL_NFA_T *limex = getImplNfa(n); + COMPRESS_REPEATS_FN(limex, dest, src, q->offset + loc); + COMPRESS_FN(limex, dest, src, key); + return 0; +} + +static really_inline +void EXPAND_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, const void *src, + u64a offset) { + if (!limex->repeatCount) { + return; + } + + // Note: state has already been expanded into 'dest'. + const STATE_T cyclics = + AND_STATE(*(STATE_T *)dest, LOAD_FROM_ENG(&limex->repeatCyclicMask)); + if (ISZERO_STATE(cyclics)) { + DEBUG_PRINTF("no cyclics are on\n"); + return; + } + + union RepeatControl *ctrl = + getRepeatControlBase((char *)dest, sizeof(STATE_T)); + const char *state_base = (const char *)src + limex->stateSize; + + for (u32 i = 0; i < limex->repeatCount; i++) { + DEBUG_PRINTF("repeat %u\n", i); + const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); + const ENG_STATE_T *tug_mask = + (const ENG_STATE_T *)((const char *)info + info->tugMaskOffset); + + if (!TESTBIT_STATE(cyclics, info->cyclicState) + && ISZERO_STATE(AND_STATE(cyclics, LOAD_FROM_ENG(tug_mask)))) { + DEBUG_PRINTF("is dead\n"); + continue; + } + + DEBUG_PRINTF("unpacking state (packedCtrlOffset=%u)\n", + info->packedCtrlOffset); + const struct RepeatInfo *repeat = getRepeatInfo(info); + repeatUnpack(state_base + info->packedCtrlOffset, repeat, offset, + &ctrl[i]); + } +} + +char JOIN(LIMEX_API_ROOT, _expandState)(const struct NFA *n, void *dest, + const void *src, u64a offset, + u8 key) { + const IMPL_NFA_T *limex = getImplNfa(n); + EXPAND_FN(limex, dest, src, key); + EXPAND_REPEATS_FN(limex, dest, src, offset); + return 0; +} + +char JOIN(LIMEX_API_ROOT, _queueInitState)(const struct NFA *n, struct mq *q) { + *(STATE_T *)q->state = ZERO_STATE; + + // Zero every bounded repeat control block in state. + const IMPL_NFA_T *limex = getImplNfa(n); + union RepeatControl *ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); + for (u32 i = 0; i < limex->repeatCount; i++) { + memset(&ctrl[i], 0, sizeof(*ctrl)); + } + + return 0; +} + +char JOIN(LIMEX_API_ROOT, _initCompressedState)(const struct NFA *n, + u64a offset, void *state, + u8 key) { + const IMPL_NFA_T *limex = getImplNfa(n); + + STATE_T s = INITIAL_FN(limex, !!offset); + if (ISZERO_STATE(s)) { + DEBUG_PRINTF("state went to zero\n"); + return 0; + } + + // NFA is still active, compress its state and ship it out. + COMPRESS_FN(limex, state, &s, key); + + // Zero every packed bounded repeat control block in stream state. + char *repeat_region = (char *)state + limex->stateSize; + for (u32 i = 0; i < limex->repeatCount; i++) { + const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); + const struct RepeatInfo *repeat = getRepeatInfo(info); + + memset(repeat_region + info->packedCtrlOffset, 0, + repeat->packedCtrlSize); + } + + return 1; +} + +// Helper for history buffer scans, which catch up the NFA state but don't emit +// matches. 
+static never_inline +void STREAMSILENT_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, + struct CONTEXT_T *ctx, u64a offset) { + const char first_match = 0; + + UNUSED char rv = STREAM_FN(limex, input, length, ctx, offset, NO_OUTPUT, + NULL, first_match); + assert(rv != MO_HALT_MATCHING); +} + +static never_inline +char STREAMCB_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, + struct CONTEXT_T *ctx, u64a offset) { + const char first_match = 0; + assert(ISALIGNED_CL(ctx)); + return STREAM_FN(limex, input, length, ctx, offset, CALLBACK_OUTPUT, NULL, + first_match); +} + +static never_inline +char STREAMFIRST_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, + struct CONTEXT_T *ctx, u64a offset, u64a *final_loc) { + const char first_match = 1; // Run to first match and stop, no callbacks. + return STREAM_FN(limex, input, length, ctx, offset, NO_OUTPUT, final_loc, + first_match); +} + +// Common code for handling the current event on the queue. +static really_inline +void JOIN(LIMEX_API_ROOT, _HandleEvent)(const IMPL_NFA_T *limex, + struct mq *q, struct CONTEXT_T *ctx, + u64a sp) { +#define DEFINE_CASE(ee) \ + case ee: \ + DEBUG_PRINTF(#ee "\n"); + + u32 e = q->items[q->cur].type; + switch (e) { + DEFINE_CASE(MQE_TOP) + ctx->s = TOP_FN(limex, !!sp, ctx->s); + break; + DEFINE_CASE(MQE_START) + break; + DEFINE_CASE(MQE_END) + break; + default: + assert(e >= MQE_TOP_FIRST); + assert(e < MQE_INVALID); + DEBUG_PRINTF("MQE_TOP + %d\n", ((int)e - MQE_TOP_FIRST)); + ctx->s = TOPN_FN(limex, ctx->s, e - MQE_TOP_FIRST); + } +#undef DEFINE_CASE +} + +// "Classic" queue call, used by outfixes +char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) { + const IMPL_NFA_T *limex = getImplNfa(n); + + if (q->report_current) { + char rv = REPORTCURRENT_FN(limex, q); + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + if (q->cur == q->end) { + return 1; + } + + assert(q->cur + 1 < q->end); /* require at least two items */ + + struct CONTEXT_T ctx; + ctx.repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); + ctx.repeat_state = q->streamState + limex->stateSize; + ctx.callback = q->cb; + ctx.context = q->context; + ctx.cached_estate = ZERO_STATE; + ctx.cached_br = 0; + + assert(q->items[q->cur].location >= 0); + DEBUG_PRINTF("LOAD STATE\n"); + ctx.s = *(STATE_T *)q->state; + assert(q->items[q->cur].type == MQE_START); + + u64a offset = q->offset; + u64a sp = offset + q->items[q->cur].location; + u64a end_abs = offset + end; + q->cur++; + + while (q->cur < q->end && sp <= end_abs) { + u64a ep = offset + q->items[q->cur].location; + ep = MIN(ep, end_abs); + assert(ep >= sp); + + assert(sp >= offset); // We no longer do history buffer scans here. 
+ + if (sp >= ep) { + goto scan_done; + } + + /* do main buffer region */ + DEBUG_PRINTF("MAIN BUFFER SCAN\n"); + assert(ep - offset <= q->length); + if (STREAMCB_FN(limex, q->buffer + sp - offset, ep - sp, &ctx, sp) + == MO_HALT_MATCHING) { + *(STATE_T *)q->state = ZERO_STATE; + return 0; + } + + DEBUG_PRINTF("SCAN DONE\n"); + scan_done: + sp = ep; + + if (sp != offset + q->items[q->cur].location) { + assert(q->cur); + DEBUG_PRINTF("bail: sp = %llu end_abs == %llu offset == %llu\n", + sp, end_abs, offset); + assert(sp == end_abs); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = sp - offset; + DEBUG_PRINTF("bailing q->cur %u q->end %u\n", q->cur, q->end); + *(STATE_T *)q->state = ctx.s; + return MO_ALIVE; + } + + JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, &ctx, sp); + + q->cur++; + } + + EXPIRE_ESTATE_FN(limex, &ctx, sp); + + DEBUG_PRINTF("END\n"); + *(STATE_T *)q->state = ctx.s; + + if (q->cur != q->end) { + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = sp - offset; + return MO_ALIVE; + } + + return ISNONZERO_STATE(ctx.s); +} + +/* used by suffix execution in Rose */ +char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) { + const IMPL_NFA_T *limex = getImplNfa(n); + + if (q->report_current) { + char rv = REPORTCURRENT_FN(limex, q); + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + if (q->cur == q->end) { + return 1; + } + + assert(q->cur + 1 < q->end); /* require at least two items */ + + struct CONTEXT_T ctx; + ctx.repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); + ctx.repeat_state = q->streamState + limex->stateSize; + ctx.callback = q->cb; + ctx.context = q->context; + ctx.cached_estate = ZERO_STATE; + ctx.cached_br = 0; + + DEBUG_PRINTF("LOAD STATE\n"); + ctx.s = *(STATE_T *)q->state; + assert(q->items[q->cur].type == MQE_START); + + u64a offset = q->offset; + u64a sp = offset + q->items[q->cur].location; + u64a end_abs = offset + end; + q->cur++; + + while (q->cur < q->end && sp <= end_abs) { + u64a ep = offset + q->items[q->cur].location; + DEBUG_PRINTF("sp = %llu, ep = %llu, end_abs = %llu\n", + sp, ep, end_abs); + ep = MIN(ep, end_abs); + assert(ep >= sp); + + if (sp < offset) { + DEBUG_PRINTF("HISTORY BUFFER SCAN\n"); + assert(offset - sp <= q->hlength); + u64a local_ep = MIN(offset, ep); + u64a final_look = 0; + /* we are starting inside the history buffer */ + if (STREAMFIRST_FN(limex, q->history + q->hlength + sp - offset, + local_ep - sp, &ctx, sp, + &final_look) == MO_HALT_MATCHING) { + DEBUG_PRINTF("final_look:%llu sp:%llu end_abs:%llu " + "offset:%llu\n", final_look, sp, end_abs, offset); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = sp + final_look - offset; + *(STATE_T *)q->state = ctx.s; + return MO_MATCHES_PENDING; + } + + sp = local_ep; + } + + if (sp >= ep) { + goto scan_done; + } + + /* do main buffer region */ + u64a final_look = 0; + assert(ep - offset <= q->length); + if (STREAMFIRST_FN(limex, q->buffer + sp - offset, ep - sp, &ctx, sp, + &final_look) == MO_HALT_MATCHING) { + DEBUG_PRINTF("final_look:%llu sp:%llu end_abs:%llu offset:%llu\n", + final_look, sp, end_abs, offset); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = sp + final_look - offset; + *(STATE_T *)q->state = ctx.s; + return MO_MATCHES_PENDING; + } + + scan_done: + sp = ep; + + if (sp != offset + q->items[q->cur].location) { + assert(q->cur); + 
DEBUG_PRINTF("bail: sp = %llu end_abs == %llu offset == %llu\n", + sp, end_abs, offset); + assert(sp == end_abs); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = sp - offset; + DEBUG_PRINTF("bailing q->cur %u q->end %u\n", q->cur, q->end); + *(STATE_T *)q->state = ctx.s; + return MO_ALIVE; + } + + JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, &ctx, sp); + + q->cur++; + } + + EXPIRE_ESTATE_FN(limex, &ctx, sp); + + DEBUG_PRINTF("END\n"); + *(STATE_T *)q->state = ctx.s; + + if (q->cur != q->end) { + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = sp - offset; + return MO_ALIVE; + } + + return ISNONZERO_STATE(ctx.s); +} + +// Used for execution Rose prefix/infixes. +char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q, + ReportID report) { + const IMPL_NFA_T *limex = getImplNfa(n); + + if (q->cur == q->end) { + return 1; + } + + assert(q->cur + 1 < q->end); /* require at least two items */ + + struct CONTEXT_T ctx; + ctx.repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); + ctx.repeat_state = q->streamState + limex->stateSize; + ctx.callback = NULL; + ctx.context = NULL; + ctx.cached_estate = ZERO_STATE; + ctx.cached_br = 0; + + DEBUG_PRINTF("LOAD STATE\n"); + ctx.s = *(STATE_T *)q->state; + assert(q->items[q->cur].type == MQE_START); + + u64a offset = q->offset; + u64a sp = offset + q->items[q->cur].location; + q->cur++; + + while (q->cur < q->end) { + u64a ep = offset + q->items[q->cur].location; + if (n->maxWidth) { + if (ep - sp > n->maxWidth) { + sp = ep - n->maxWidth; + ctx.s = INITIAL_FN(limex, !!sp); + } + } + assert(ep >= sp); + + if (sp < offset) { + DEBUG_PRINTF("HISTORY BUFFER SCAN\n"); + assert(offset - sp <= q->hlength); + u64a local_ep = MIN(offset, ep); + /* we are starting inside the history buffer */ + STREAMSILENT_FN(limex, q->history + q->hlength + sp - offset, + local_ep - sp, &ctx, sp); + + sp = local_ep; + } + + if (sp >= ep) { + goto scan_done; + } + + /* do main buffer region */ + DEBUG_PRINTF("MAIN BUFFER SCAN\n"); + assert(ep - offset <= q->length); + STREAMSILENT_FN(limex, q->buffer + sp - offset, ep - sp, &ctx, sp); + + DEBUG_PRINTF("SCAN DONE\n"); + scan_done: + sp = ep; + + JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, &ctx, sp); + + q->cur++; + } + + EXPIRE_ESTATE_FN(limex, &ctx, sp); + + DEBUG_PRINTF("END, nfa is %s\n", + ISNONZERO_STATE(ctx.s) ? "still alive" : "dead"); + + *(STATE_T *)q->state = ctx.s; + + if (JOIN(limexInAccept, SIZE)(limex, ctx.s, ctx.repeat_ctrl, + ctx.repeat_state, sp + 1, report)) { + return MO_MATCHES_PENDING; + } + + return ISNONZERO_STATE(ctx.s); +} + +char JOIN(LIMEX_API_ROOT, _testEOD)(const struct NFA *n, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(n && state); + + const IMPL_NFA_T *limex = getImplNfa(n); + const STATE_T *sptr = (const STATE_T *)state; + const union RepeatControl *repeat_ctrl = + getRepeatControlBaseConst(state, sizeof(STATE_T)); + const char *repeat_state = streamState + limex->stateSize; + return TESTEOD_FN(limex, sptr, repeat_ctrl, repeat_state, offset, callback, + context); +} + +char JOIN(LIMEX_API_ROOT, _reportCurrent)(const struct NFA *n, struct mq *q) { + const IMPL_NFA_T *limex = getImplNfa(n); + REPORTCURRENT_FN(limex, q); + return 1; +} + +// Block mode reverse scan. 
+char JOIN(LIMEX_API_ROOT, _B_Reverse)(const struct NFA *n, u64a offset, + const u8 *buf, size_t buflen, + const u8 *hbuf, size_t hlen, + NfaCallback cb, void *context) { + assert(buf || hbuf); + assert(buflen || hlen); + + struct CONTEXT_T ctx; + ctx.repeat_ctrl = NULL; + ctx.repeat_state = NULL; + ctx.callback = cb; + ctx.context = context; + ctx.cached_estate = ZERO_STATE; + ctx.cached_br = 0; + + const IMPL_NFA_T *limex = getImplNfa(n); + ctx.s = INITIAL_FN(limex, 0); // always anchored + + // 'buf' may be null, for example when we're scanning at EOD time. + if (buflen) { + assert(buf); + DEBUG_PRINTF("MAIN BUFFER SCAN, %zu bytes\n", buflen); + offset -= buflen; + REV_STREAM_FN(limex, buf, buflen, &ctx, offset); + } + + if (hlen) { + assert(hbuf); + DEBUG_PRINTF("HISTORY BUFFER SCAN, %zu bytes\n", hlen); + offset -= hlen; + REV_STREAM_FN(limex, hbuf, hlen, &ctx, offset); + } + + if (offset == 0 && limex->acceptEodCount && ISNONZERO_STATE(ctx.s)) { + const union RepeatControl *repeat_ctrl = NULL; + const char *repeat_state = NULL; + TESTEOD_FN(limex, &ctx.s, repeat_ctrl, repeat_state, offset, cb, + context); + } + + // NOTE: return value is unused. + return 0; +} + +char JOIN(LIMEX_API_ROOT, _inAccept)(const struct NFA *nfa, + ReportID report, struct mq *q) { + assert(nfa && q); + assert(q->state && q->streamState); + + const IMPL_NFA_T *limex = getImplNfa(nfa); + union RepeatControl *repeat_ctrl = + getRepeatControlBase(q->state, sizeof(STATE_T)); + char *repeat_state = q->streamState + limex->stateSize; + STATE_T state = *(STATE_T *)q->state; + u64a offset = q->offset + q_last_loc(q) + 1; + + return JOIN(limexInAccept, SIZE)(limex, state, repeat_ctrl, repeat_state, + offset, report); +} + +char JOIN(LIMEX_API_ROOT, _inAnyAccept)(const struct NFA *nfa, struct mq *q) { + assert(nfa && q); + assert(q->state && q->streamState); + + const IMPL_NFA_T *limex = getImplNfa(nfa); + union RepeatControl *repeat_ctrl = + getRepeatControlBase(q->state, sizeof(STATE_T)); + char *repeat_state = q->streamState + limex->stateSize; + STATE_T state = *(STATE_T *)q->state; + u64a offset = q->offset + q_last_loc(q) + 1; + + return JOIN(limexInAnyAccept, SIZE)(limex, state, repeat_ctrl, repeat_state, + offset); +} + +enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)( + const struct NFA *nfa, + struct mq *q, + s64a loc) { + assert(nfa->flags & NFA_ZOMBIE); + const IMPL_NFA_T *limex = getImplNfa(nfa); + STATE_T state = *(STATE_T *)q->state; + STATE_T zmask = LOAD_FROM_ENG(&limex->zombieMask); + + if (limex->repeatCount) { + u64a offset = q->offset + loc + 1; + union RepeatControl *repeat_ctrl = + getRepeatControlBase(q->state, sizeof(STATE_T)); + char *repeat_state = q->streamState + limex->stateSize; + SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, offset, &state); + } + + if (ISNONZERO_STATE(AND_STATE(state, zmask))) { + return NFA_ZOMBIE_ALWAYS_YES; + } + + return NFA_ZOMBIE_NO; +} + +#undef TESTEOD_FN +#undef INITIAL_FN +#undef TOP_FN +#undef TOPN_FN +#undef REPORTCURRENT_FN +#undef COMPRESS_FN +#undef EXPAND_FN +#undef COMPRESS_REPEATS_FN +#undef EXPAND_REPEATS_FN +#undef PROCESS_ACCEPTS_FN +#undef PROCESS_ACCEPTS_NOSQUASH_FN +#undef GET_NFA_REPEAT_INFO_FN +#undef RUN_ACCEL_FN +#undef RUN_EXCEPTIONS_FN +#undef REV_STREAM_FN +#undef LOOP_NOACCEL_FN +#undef STREAM_FN +#undef STREAMCB_FN +#undef STREAMFIRST_FN +#undef STREAMSILENT_FN +#undef CONTEXT_T +#undef EXCEPTION_T +#undef AND_STATE +#undef ANDNOT_STATE +#undef OR_STATE +#undef LSHIFT_STATE +#undef TESTBIT_STATE +#undef CLEARBIT_STATE 
+#undef ZERO_STATE +#undef ISNONZERO_STATE +#undef ISZERO_STATE +#undef NOTEQ_STATE +#undef DIFFRICH_STATE +#undef INLINE_ATTR_INT +#undef IMPL_NFA_T +#undef SQUASH_UNTUG_BR_FN +#undef ACCEL_MASK +#undef ACCEL_AND_FRIENDS_MASK +#undef EXCEPTION_MASK +#undef LIMEX_API_ROOT diff --git a/regex/nfa/limex_shuffle.h b/regex/nfa/limex_shuffle.h new file mode 100644 index 000000000..365d47296 --- /dev/null +++ b/regex/nfa/limex_shuffle.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Naive dynamic shuffles. + * + * These are written with the assumption that the provided masks are sparsely + * populated and never contain more than 32 on bits. Other implementations will + * be faster and actually correct if these assumptions don't hold true. 
+ */ + +#ifndef LIMEX_SHUFFLE_H +#define LIMEX_SHUFFLE_H + +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +static really_inline +u32 packedExtract128(m128 s, const m128 permute, const m128 compare) { + m128 shuffled = pshufb_m128(s, permute); + m128 compared = and128(shuffled, compare); + u16 rv = ~movemask128(eq128(compared, shuffled)); + return (u32)rv; +} + +#if defined(HAVE_AVX2) +static really_inline +u32 packedExtract256(m256 s, const m256 permute, const m256 compare) { + // vpshufb doesn't cross lanes, so this is a bit of a cheat + m256 shuffled = pshufb_m256(s, permute); + m256 compared = and256(shuffled, compare); + u32 rv = ~movemask256(eq256(compared, shuffled)); + // stitch the lane-wise results back together + return (u32)((rv >> 16) | (rv & 0xffffU)); +} +#endif // AVX2 + +#if defined(HAVE_AVX512) +static really_inline +u32 packedExtract512(m512 s, const m512 permute, const m512 compare) { + // vpshufb doesn't cross lanes, so this is a bit of a cheat + m512 shuffled = pshufb_m512(s, permute); + m512 compared = and512(shuffled, compare); + u64a rv = ~eq512mask(compared, shuffled); + // stitch the lane-wise results back together + rv = rv >> 32 | rv; + return (u32)(((rv >> 16) | rv) & 0xffffU); +} +#endif // AVX512 + +#endif // LIMEX_SHUFFLE_H diff --git a/regex/nfa/limex_simd128.c b/regex/nfa/limex_simd128.c new file mode 100644 index 000000000..c5f2b33e3 --- /dev/null +++ b/regex/nfa/limex_simd128.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief LimEx NFA: 128-bit SIMD runtime implementations. 
+ */ + +//#define DEBUG_INPUT +//#define DEBUG_EXCEPTIONS + +#include "limex.h" + +#include "accel.h" +#include "limex_internal.h" +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +// Common code +#define STATE_ON_STACK +#define ESTATE_ON_STACK + +#include "limex_runtime.h" + +#define SIZE 128 +#define STATE_T m128 +#define ENG_STATE_T m128 +#define LOAD_FROM_ENG load_m128 + +#include "limex_exceptional.h" + +#include "limex_state_impl.h" + +#define INLINE_ATTR really_inline +#include "limex_common_impl.h" + +#include "limex_runtime_impl.h" diff --git a/regex/nfa/limex_simd256.c b/regex/nfa/limex_simd256.c new file mode 100644 index 000000000..cc2329081 --- /dev/null +++ b/regex/nfa/limex_simd256.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief LimEx NFA: 256-bit SIMD runtime implementations. + */ + +//#define DEBUG_INPUT +//#define DEBUG_EXCEPTIONS + +#include "limex.h" + +#include "accel.h" +#include "limex_internal.h" +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +// Common code +#include "limex_runtime.h" + +#define SIZE 256 +#define STATE_T m256 +#define ENG_STATE_T m256 +#define LOAD_FROM_ENG load_m256 + +#include "limex_exceptional.h" + +#include "limex_state_impl.h" + +#define INLINE_ATTR really_inline +#include "limex_common_impl.h" + +#include "limex_runtime_impl.h" diff --git a/regex/nfa/limex_simd384.c b/regex/nfa/limex_simd384.c new file mode 100644 index 000000000..7e596e48b --- /dev/null +++ b/regex/nfa/limex_simd384.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief LimEx NFA: 384-bit SIMD runtime implementations. + */ + +//#define DEBUG_INPUT +//#define DEBUG_EXCEPTIONS + +#include "limex.h" + +#include "accel.h" +#include "limex_internal.h" +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +// Common code +#include "limex_runtime.h" + +#define SIZE 384 +#define STATE_T m384 +#define ENG_STATE_T m384 +#define LOAD_FROM_ENG load_m384 + +#include "limex_exceptional.h" + +#include "limex_state_impl.h" + +#define INLINE_ATTR really_inline +#include "limex_common_impl.h" + +#include "limex_runtime_impl.h" diff --git a/regex/nfa/limex_simd512.c b/regex/nfa/limex_simd512.c new file mode 100644 index 000000000..f779f335d --- /dev/null +++ b/regex/nfa/limex_simd512.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief LimEx NFA: 512-bit SIMD runtime implementations. + */ + +//#define DEBUG_INPUT +//#define DEBUG_EXCEPTIONS + +#include "limex.h" + +#include "accel.h" +#include "limex_internal.h" +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +// Common code +#include "limex_runtime.h" + +#define SIZE 512 +#define STATE_T m512 +#define ENG_STATE_T m512 +#define LOAD_FROM_ENG load_m512 + +#include "limex_exceptional.h" + +#include "limex_state_impl.h" + +#define INLINE_ATTR really_inline +#include "limex_common_impl.h" + +#include "limex_runtime_impl.h" diff --git a/regex/nfa/limex_state_impl.h b/regex/nfa/limex_state_impl.h new file mode 100644 index 000000000..732874047 --- /dev/null +++ b/regex/nfa/limex_state_impl.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief NFA stream state handling. + */ + +#include "util/join.h" +#include "util/partial_store.h" +#include "util/state_compress.h" +#ifndef __KERNEL__ +#include +#else +#include +#endif + +#if !defined(SIZE) || !defined(STATE_T) || !defined(LOAD_FROM_ENG) +# error Must define SIZE, STATE_T, LOAD_FROM_ENG in includer. 
+#endif + +#define IMPL_NFA_T JOIN(struct LimExNFA, SIZE) +#define COMMON_T JOIN(NFACommon, SIZE) +#define REACHMASK_FN JOIN(moNfaReachMask, SIZE) +#define COMPRESS_FN JOIN(moNfaCompressState, SIZE) +#define EXPAND_FN JOIN(moNfaExpandState, SIZE) +#define COMPRESSED_STORE_FN JOIN(store_compressed_, STATE_T) +#define COMPRESSED_LOAD_FN JOIN(load_compressed_, STATE_T) +#define PARTIAL_STORE_FN JOIN(partial_store_, STATE_T) +#define PARTIAL_LOAD_FN JOIN(partial_load_, STATE_T) +#define OR_STATE JOIN(or_, STATE_T) +#define AND_STATE JOIN(and_, STATE_T) +#define ISZERO_STATE JOIN(isZero_, STATE_T) + +static really_inline +const ENG_STATE_T *get_reach_table(const IMPL_NFA_T *limex) { + const ENG_STATE_T *reach + = (const ENG_STATE_T *)((const char *)limex + sizeof(*limex)); + assert(ISALIGNED_N(reach, alignof(ENG_STATE_T))); + return reach; +} + +static really_inline +STATE_T REACHMASK_FN(const IMPL_NFA_T *limex, const u8 key) { + const ENG_STATE_T *reach = get_reach_table(limex); + return LOAD_FROM_ENG(&reach[limex->reachMap[key]]); +} + +static really_inline +void COMPRESS_FN(const IMPL_NFA_T *limex, u8 *dest, const STATE_T *src, + u8 key) { + assert(ISALIGNED_N(src, alignof(STATE_T))); + STATE_T a_src = *src; + + DEBUG_PRINTF("compress state: %p -> %p\n", src, dest); + + if (!(limex->flags & LIMEX_FLAG_COMPRESS_STATE)) { + // No key-based compression, just a partial store. + DEBUG_PRINTF("store state into %u bytes\n", limex->stateSize); + PARTIAL_STORE_FN(dest, a_src, limex->stateSize); + } else { + DEBUG_PRINTF("compress state, key=%hhx\n", key); + + STATE_T reachmask = REACHMASK_FN(limex, key); + + // Masked compression means that we mask off the initDs states and + // provide a shortcut for the all-zeroes case. Note that these must be + // switched on in the EXPAND call below. + if (limex->flags & LIMEX_FLAG_COMPRESS_MASKED) { + STATE_T s = AND_STATE(LOAD_FROM_ENG(&limex->compressMask), a_src); + if (ISZERO_STATE(s)) { + DEBUG_PRINTF("after compression mask, all states are zero\n"); + memset(dest, 0, limex->stateSize); + return; + } + + STATE_T mask = AND_STATE(LOAD_FROM_ENG(&limex->compressMask), + reachmask); + COMPRESSED_STORE_FN(dest, &s, &mask, limex->stateSize); + } else { + COMPRESSED_STORE_FN(dest, src, &reachmask, limex->stateSize); + } + } +} + +static really_inline +void EXPAND_FN(const IMPL_NFA_T *limex, STATE_T *dest, const u8 *src, u8 key) { + assert(ISALIGNED_N(dest, alignof(STATE_T))); + DEBUG_PRINTF("expand state: %p -> %p\n", src, dest); + + if (!(limex->flags & LIMEX_FLAG_COMPRESS_STATE)) { + // No key-based compression, just a partial load. 
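/*
 * For illustration only (not part of the patch): a stand-alone sketch of the
 * idea behind COMPRESSED_STORE_FN/COMPRESSED_LOAD_FN used by COMPRESS_FN and
 * EXPAND_FN above.  Only the state bits permitted by a mask are kept,
 * squeezed into contiguous low bits (PEXT-like), and later scattered back
 * under the same mask (PDEP-like).  Plain-C loops over a 64-bit word stand
 * in for the real multi-width SIMD versions.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* gather the bits of `v` selected by `mask` into contiguous low bits */
static uint64_t compress64_sketch(uint64_t v, uint64_t mask) {
    uint64_t out = 0;
    for (int i = 0, j = 0; i < 64; i++) {
        if (mask & (1ULL << i)) {
            if (v & (1ULL << i))
                out |= 1ULL << j;
            j++;
        }
    }
    return out;
}

/* inverse: scatter low bits of `v` back to the positions set in `mask` */
static uint64_t expand64_sketch(uint64_t v, uint64_t mask) {
    uint64_t out = 0;
    for (int i = 0, j = 0; i < 64; i++) {
        if (mask & (1ULL << i)) {
            if (v & (1ULL << j))
                out |= 1ULL << i;
            j++;
        }
    }
    return out;
}

int main(void) {
    uint64_t state = 0x00f000a5ULL, mask = 0x00ff00ffULL;
    uint64_t packed = compress64_sketch(state, mask);
    printf("packed = 0x%llx\n", (unsigned long long)packed);  /* 0xf0a5 */
    assert(expand64_sketch(packed, mask) == (state & mask));
    return 0;
}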
+ DEBUG_PRINTF("load state from %u bytes\n", limex->stateSize); + *dest = PARTIAL_LOAD_FN(src, limex->stateSize); + } else { + DEBUG_PRINTF("expand state, key=%hhx\n", key); + STATE_T reachmask = REACHMASK_FN(limex, key); + + if (limex->flags & LIMEX_FLAG_COMPRESS_MASKED) { + STATE_T mask = AND_STATE(LOAD_FROM_ENG(&limex->compressMask), + reachmask); + COMPRESSED_LOAD_FN(dest, src, &mask, limex->stateSize); + *dest = OR_STATE(LOAD_FROM_ENG(&limex->initDS), *dest); + } else { + COMPRESSED_LOAD_FN(dest, src, &reachmask, limex->stateSize); + } + } +} + +#undef IMPL_NFA_T +#undef COMMON_T +#undef REACHMASK_FN +#undef COMPRESS_FN +#undef EXPAND_FN +#undef COMPRESSED_STORE_FN +#undef COMPRESSED_LOAD_FN +#undef PARTIAL_STORE_FN +#undef PARTIAL_LOAD_FN +#undef OR_STATE +#undef AND_STATE +#undef ISZERO_STATE diff --git a/regex/nfa/mcclellan.c b/regex/nfa/mcclellan.c new file mode 100644 index 000000000..71f71e327 --- /dev/null +++ b/regex/nfa/mcclellan.c @@ -0,0 +1,1350 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "mcclellan.h" + +#include "accel.h" +#include "mcclellan_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "util/bitutils.h" +#include "util/compare.h" +#include "util/simd_utils.h" +#include "ue2common.h" + +#include "mcclellan_common_impl.h" + +static really_inline +char doComplexReport(NfaCallback cb, void *ctxt, const struct mcclellan *m, + u32 s, u64a loc, char eod, u32 *cached_accept_state, + u32 *cached_accept_id) { + DEBUG_PRINTF("reporting state = %u, loc=%llu, eod %hhu\n", + s & STATE_MASK, loc, eod); + + if (!eod && s == *cached_accept_state) { + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + const struct mstate_aux *aux = get_aux(m, s); + size_t offset = eod ? 
aux->accept_eod : aux->accept; + + assert(offset); + const struct report_list *rl + = (const void *)((const char *)m + offset - sizeof(struct NFA)); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list size %u\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = s; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +static really_inline +const u8 *run_mcclellan_accel(const struct mcclellan *m, + const struct mstate_aux *aux, u32 s, + const u8 **min_accel_offset, + const u8 *c, const u8 *c_end) { + DEBUG_PRINTF("skipping\n"); + u32 accel_offset = aux[s].accel_offset; + + assert(aux[s].accel_offset); + assert(accel_offset >= m->aux_offset); + assert(!m->sherman_offset || accel_offset < m->sherman_offset); + + const union AccelAux *aaux = (const void *)((const char *)m + accel_offset); + const u8 *c2 = run_accel(aaux, c, c_end); + + if (c2 < *min_accel_offset + BAD_ACCEL_DIST) { + *min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + *min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (*min_accel_offset >= c_end - ACCEL_MIN_LEN) { + *min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, *min_accel_offset - c2, c_end - c2); + + return c2; +} + +static really_inline +u32 doNormal16(const struct mcclellan *m, const u8 **c_inout, const u8 *end, + u32 s, char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + + const u16 *succ_table + = (const u16 *)((const char *)m + sizeof(struct mcclellan)); + assert(ISALIGNED_N(succ_table, 2)); + u32 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)m - sizeof(struct NFA) + m->sherman_offset; + u32 as = m->alphaShift; + + s &= STATE_MASK; + + while (c < end && s) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u)\n", *c, + ourisprint(*c) ? 
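/*
 * For illustration only (not part of the patch): a stripped-down sketch of
 * the caching done by doComplexReport() above.  When an accept state with a
 * single report fires, the (state, report) pair is remembered so the next
 * hit on the same state skips the aux/report-list walk.  State 0 is the dead
 * state, so it doubles as a safe "empty cache" value; the lookup function and
 * values below are made up.
 */
#include <stdint.h>
#include <stdio.h>

/* stand-in for walking mstate_aux + report_list; pretend it is expensive */
static uint32_t lookup_single_report(uint32_t state) {
    return state * 100u;
}

static uint32_t report_for_state(uint32_t state, uint32_t *cached_state,
                                 uint32_t *cached_report) {
    if (state == *cached_state)
        return *cached_report;              /* fast path: cache hit */
    uint32_t report = lookup_single_report(state);
    *cached_state  = state;                 /* remember for next time */
    *cached_report = report;
    return report;
}

int main(void) {
    uint32_t cached_state = 0, cached_report = 0;
    printf("%u\n", report_for_state(7, &cached_state, &cached_report)); /* slow path */
    printf("%u\n", report_for_state(7, &cached_state, &cached_report)); /* cached */
    return 0;
}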
*c : '?', cprime, s); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[(s << as) + cprime]; + } else { + const char *sherman_state + = findShermanState(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman (%u)\n", s); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + + DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK); + c++; + + if (do_accel && (s & ACCEL_FLAG)) { + break; + } + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + break; + } + + s &= STATE_MASK; + } + + *c_inout = c; + return s; +} + +static really_inline +u32 doNormalWide16(const struct mcclellan *m, const u8 **c_inout, + const u8 *end, u32 s, char *qstate, u16 *offset, + char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + + u32 wide_limit = m->wide_limit; + const char *wide_base + = (const char *)m - sizeof(struct NFA) + m->wide_offset; + + const u16 *succ_table + = (const u16 *)((const char *)m + sizeof(struct mcclellan)); + assert(ISALIGNED_N(succ_table, 2)); + u32 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)m - sizeof(struct NFA) + m->sherman_offset; + u32 as = m->alphaShift; + + s &= STATE_MASK; + + while (c < end && s) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u) &c: %p\n", *c, + ourisprint(*c) ? *c : '?', cprime, s, c); + + if (unlikely(s >= wide_limit)) { + const char *wide_entry + = findWideEntry16(m, wide_base, wide_limit, s); + DEBUG_PRINTF("doing wide head (%u)\n", s); + s = doWide16(wide_entry, &c, end, m->remap, (u16 *)&s, qstate, + offset); + } else if (s >= sherman_base) { + const char *sherman_state + = findShermanState(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman (%u)\n", s); + s = doSherman16(sherman_state, cprime, succ_table, as); + } else { + DEBUG_PRINTF("doing normal\n"); + s = succ_table[(s << as) + cprime]; + } + + DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK); + c++; + + if (do_accel && (s & ACCEL_FLAG)) { + break; + } + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + break; + } + + s &= STATE_MASK; + } + + *c_inout = c; + return s; +} + +static really_inline +char mcclellanExec16_i(const struct mcclellan *m, u32 *state, char *qstate, + const u8 *buf, size_t len, u64a offAdj, NfaCallback cb, + void *ctxt, char single, const u8 **c_final, + enum MatchMode mode) { + assert(ISALIGNED_N(state, 2)); + if (!len) { + if (mode == STOP_AT_MATCH) { + *c_final = buf; + } + return MO_ALIVE; + } + + u32 s = *state; + u16 offset = 0; + const u8 *c = buf; + const u8 *c_end = buf + len; + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + + s &= STATE_MASK; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + if (!s) { + goto exit; + } + + if (unlikely(m->has_wide)) { + s = doNormalWide16(m, &c, min_accel_offset, s, qstate, &offset, 0, + mode); + } else { + s = doNormal16(m, &c, min_accel_offset, s, 0, mode); + } + + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); 
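/*
 * For illustration only (not part of the patch): a small sketch of the flag
 * packing the 16-bit loops above depend on.  The successor table stores the
 * next state in the low 14 bits and folds "accepting"/"accelerable" into the
 * top two bits, so the hot loop can test both without another memory access;
 * the constants mirror ACCEPT_FLAG/ACCEL_FLAG/STATE_MASK defined later in
 * mcclellan_internal.h.
 */
#include <stdint.h>
#include <stdio.h>

#define ACCEPT_FLAG 0x8000u
#define ACCEL_FLAG  0x4000u
#define STATE_MASK  0x3fffu

static void classify(uint16_t raw) {
    printf("state %u%s%s\n", raw & STATE_MASK,
           (raw & ACCEPT_FLAG) ? " [accept]" : "",
           (raw & ACCEL_FLAG) ? " [accel]" : "");
}

int main(void) {
    classify(0x0007);          /* plain state 7 */
    classify(0x8000u | 42);    /* accepting state 42 */
    classify(0xc000u | 3);     /* state 3, accepting and accelerable */
    return 0;
}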
+ if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= min_accel_offset); + } while (c < min_accel_offset); + + s &= STATE_MASK; + + if (c == c_end) { + goto exit; + } else { + goto with_accel; + } + +with_accel: + do { + assert(c < c_end); + if (!s) { + goto exit; + } + + if (s & ACCEL_FLAG) { + DEBUG_PRINTF("skipping\n"); + s &= STATE_MASK; + c = run_mcclellan_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + + if (unlikely(m->has_wide)) { + s = doNormalWide16(m, &c, c_end, s, qstate, &offset, 1, mode); + } else { + s = doNormal16(m, &c, c_end, s, 1, mode); + } + + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + s &= STATE_MASK; + + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + *state = s; + + return MO_ALIVE; +} + +static never_inline +char mcclellanExec16_i_cb(const struct mcclellan *m, u32 *state, char *qstate, + const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, char single, + const u8 **final_point) { + return mcclellanExec16_i(m, state, qstate, buf, len, offAdj, cb, ctxt, + single, final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcclellanExec16_i_sam(const struct mcclellan *m, u32 *state, char *qstate, + const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, char single, + const u8 **final_point) { + return mcclellanExec16_i(m, state, qstate, buf, len, offAdj, cb, ctxt, + single, final_point, STOP_AT_MATCH); +} + +static never_inline +char mcclellanExec16_i_nm(const struct mcclellan *m, u32 *state, char *qstate, + const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, char single, + const u8 **final_point) { + return mcclellanExec16_i(m, state, qstate, buf, len, offAdj, cb, ctxt, + single, final_point, NO_MATCHES); +} + +static really_inline +char mcclellanExec16_i_ni(const struct mcclellan *m, u32 *state, char *qstate, + const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, char single, + const u8 **final_point, enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcclellanExec16_i_cb(m, state, qstate, buf, len, offAdj, cb, + ctxt, single, final_point); + } else if (mode == STOP_AT_MATCH) { + return mcclellanExec16_i_sam(m, state, qstate, buf, len, offAdj, cb, + ctxt, single, final_point); + } else { + assert(mode == NO_MATCHES); + return mcclellanExec16_i_nm(m, state, qstate, buf, len, offAdj, cb, + ctxt, single, final_point); + } +} + +static really_inline +u32 doNormal8(const struct mcclellan *m, const u8 **c_inout, const u8 *end, + u32 s, char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + u32 accel_limit = m->accel_limit_8; + u32 accept_limit = m->accept_limit_8; + + const u32 as = m->alphaShift; + const u8 *succ_table = 
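/*
 * For illustration only (not part of the patch): a sketch of the apparent
 * intent behind the _cb/_sam/_nm/_ni wrapper layering above.  A single
 * always-inline worker takes `mode` as a parameter, and per-mode noinline
 * wrappers call it with a constant, so each wrapper gets its own copy with
 * the mode branches folded away; the _ni dispatcher then selects a wrapper.
 * Attribute spellings are GCC/Clang style and the worker body is a dummy.
 */
#include <stdio.h>

enum mode_sketch { CB_OUTPUT, STOP_MATCH, NO_MATCH };

static inline __attribute__((always_inline))
int worker(int x, enum mode_sketch mode) {
    if (mode == NO_MATCH)
        return 0;                       /* resolved at compile time per copy */
    return (mode == STOP_MATCH) ? x : x + 1;
}

static __attribute__((noinline)) int worker_cb(int x)  { return worker(x, CB_OUTPUT); }
static __attribute__((noinline)) int worker_sam(int x) { return worker(x, STOP_MATCH); }
static __attribute__((noinline)) int worker_nm(int x)  { return worker(x, NO_MATCH); }

/* dispatcher mirroring the shape of mcclellanExec16_i_ni() */
static inline int worker_ni(int x, enum mode_sketch mode) {
    if (mode == CB_OUTPUT)
        return worker_cb(x);
    else if (mode == STOP_MATCH)
        return worker_sam(x);
    else
        return worker_nm(x);
}

int main(void) {
    printf("%d %d %d\n", worker_ni(1, CB_OUTPUT),
           worker_ni(1, STOP_MATCH), worker_ni(1, NO_MATCH));  /* 2 1 0 */
    return 0;
}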
(const u8 *)((const char *)m + + sizeof(struct mcclellan)); + while (c < end && s) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *c, + ourisprint(*c) ? *c : '?', cprime); + s = succ_table[(s << as) + cprime]; + + DEBUG_PRINTF("s: %u\n", s); + c++; + if (do_accel) { + if (s >= accel_limit) { + break; + } + } else { + if (mode != NO_MATCHES && s >= accept_limit) { + break; + } + } + } + *c_inout = c; + return s; +} + +static really_inline +char mcclellanExec8_i(const struct mcclellan *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + if (!len) { + if (mode == STOP_AT_MATCH) { + *c_final = buf; + } + return MO_ALIVE; + } + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + u32 accept_limit = m->accept_limit_8; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit); + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + if (!s) { + goto exit; + } + + s = doNormal8(m, &c, min_accel_offset, s, 0, mode); + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport(cb, ctxt, m, s, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= min_accel_offset); + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + u32 accel_limit = m->accel_limit_8; + assert(c < c_end); + + if (!s) { + goto exit; + } + + if (s >= accel_limit && aux[s].accel_offset) { + c = run_mcclellan_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doNormal8(m, &c, c_end, s, 1, mode); + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport(cb, ctxt, m, s, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + *state = s; + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + return MO_ALIVE; +} + +static never_inline +char mcclellanExec8_i_cb(const struct mcclellan *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcclellanExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcclellanExec8_i_sam(const struct mcclellan *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, 
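/*
 * For illustration only (not part of the patch): a toy version of the 8-bit
 * McClellan step visible in doNormal8()/mcclellanExec8_i() above.  Bytes are
 * remapped to a small alphabet, the next state is one load at
 * (state << alphaShift) + symbol, and "is this an accept state?" is just a
 * comparison against an accept limit, because accepting states are numbered
 * last (accept_limit_8 in the real structure).  The two-state machine below
 * matches "ab" and is purely illustrative.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum { SYM_A, SYM_B, SYM_OTHER, NSYMS };   /* remapped alphabet        */
#define ALPHA_SHIFT 2                      /* 1 << 2 >= NSYMS          */
#define ACCEPT_LIMIT 2                     /* states >= 2 are accepts  */

int main(void) {
    uint8_t remap[256];
    memset(remap, SYM_OTHER, sizeof(remap));
    remap['a'] = SYM_A;
    remap['b'] = SYM_B;

    /* states: 0 = start, 1 = seen 'a', 2 = accept ("ab" seen) */
    uint8_t succ[3 << ALPHA_SHIFT] = {0};
    succ[(0 << ALPHA_SHIFT) + SYM_A] = 1;
    succ[(1 << ALPHA_SHIFT) + SYM_A] = 1;
    succ[(1 << ALPHA_SHIFT) + SYM_B] = 2;
    succ[(2 << ALPHA_SHIFT) + SYM_A] = 1;

    const char *buf = "xxabab";
    uint32_t s = 0;
    for (const char *c = buf; *c; c++) {
        s = succ[(s << ALPHA_SHIFT) + remap[(uint8_t)*c]];
        if (s >= ACCEPT_LIMIT)
            printf("match ending at offset %td\n", c - buf + 1);
    }
    return 0;
}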
NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcclellanExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcclellanExec8_i_nm(const struct mcclellan *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcclellanExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcclellanExec8_i_ni(const struct mcclellan *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcclellanExec8_i_cb(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } else if (mode == STOP_AT_MATCH) { + return mcclellanExec8_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert(mode == NO_MATCHES); + return mcclellanExec8_i_nm(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } +} + +static really_inline +char mcclellanCheckEOD(const struct NFA *nfa, u32 s, u64a offset, + NfaCallback cb, void *ctxt) { + const struct mcclellan *m = getImplNfa(nfa); + const struct mstate_aux *aux = get_aux(m, s); + + if (m->has_wide == 1 && s >= m->wide_limit) { + return MO_CONTINUE_MATCHING; + } + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + return doComplexReport(cb, ctxt, m, s, offset, 1, NULL, NULL); +} + +static really_inline +char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCCLELLAN_NFA_16); + const struct mcclellan *m = getImplNfa(n); + s64a sp; + + assert(ISALIGNED_N(q->state, 2)); + u32 s = *(u16 *)q->state; + + if (q->report_current) { + assert(s); + assert(get_aux(m, s)->accept); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
hend : buffer; + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + /* do main buffer region */ + const u8 *final_look; + char rv = mcclellanExec16_i_ni(m, &s, q->state, cur_buf + sp, + local_ep - sp, offset + sp, cb, context, + single, &final_look, mode); + if (rv == MO_DEAD) { + *(u16 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u16 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = m->start_anchored; + break; + } + s = mcclellanEnableStarts(m, s); + break; + case MQE_END: + *(u16 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +static really_inline +char nfaExecMcClellan16_Bi(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context, + char single) { + assert(n->type == MCCLELLAN_NFA_16); + const struct mcclellan *m = getImplNfa(n); + u32 s = m->start_anchored; + + if (mcclellanExec16_i(m, &s, NULL, buffer, length, offset, cb, context, + single, NULL, CALLBACK_OUTPUT) + == MO_DEAD) { + return s ? MO_ALIVE : MO_DEAD; + } + + if (m->has_wide == 1 && s >= m->wide_limit) { + return MO_ALIVE; + } + + const struct mstate_aux *aux = get_aux(m, s); + + if (aux->accept_eod) { + doComplexReport(cb, context, m, s, offset + length, 1, NULL, NULL); + } + + return MO_ALIVE; +} + +static really_inline +char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCCLELLAN_NFA_8); + const struct mcclellan *m = getImplNfa(n); + s64a sp; + + u32 s = *(u8 *)q->state; + + if (q->report_current) { + assert(s); + assert(s >= m->accept_limit_8); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
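/*
 * For illustration only (not part of the patch): a sketch of the location
 * convention used by the *_Q2i() stream functions above.  Queue locations
 * are signed: negative values address the tail of the retained history
 * (hend points one past its end), non-negative values address the current
 * scan buffer, which is why cur_buf is chosen as `sp < 0 ? hend : buffer`.
 * Names here are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

static uint8_t byte_at(int64_t loc, const uint8_t *buffer,
                       const uint8_t *hend) {
    /* loc == -1 is the last history byte, loc == 0 the first buffer byte */
    const uint8_t *base = (loc < 0) ? hend : buffer;
    return base[loc];
}

int main(void) {
    const uint8_t history[] = "old";
    const uint8_t buffer[]  = "new";
    const uint8_t *hend = history + 3;    /* one past the retained history */

    printf("%c %c %c\n",
           byte_at(-1, buffer, hend),     /* 'd' (last history byte) */
           byte_at(0, buffer, hend),      /* 'n' (first buffer byte) */
           byte_at(2, buffer, hend));     /* 'w'                     */
    return 0;
}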
hend : buffer; + + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" : + q->items[q->cur].type == MQE_END ? "END" : "???", + q->items[q->cur].location + offset); + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + const u8 *final_look; + char rv = mcclellanExec8_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + + if (rv == MO_HALT_MATCHING) { + *(u8 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u8 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = (u8)m->start_anchored; + break; + } + s = mcclellanEnableStarts(m, s); + break; + case MQE_END: + *(u8 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +static really_inline +char nfaExecMcClellan8_Bi(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context, + char single) { + assert(n->type == MCCLELLAN_NFA_8); + const struct mcclellan *m = getImplNfa(n); + u32 s = m->start_anchored; + + if (mcclellanExec8_i(m, &s, buffer, length, offset, cb, context, single, + NULL, CALLBACK_OUTPUT) + == MO_DEAD) { + return MO_DEAD; + } + + const struct mstate_aux *aux = get_aux(m, s); + + if (aux->accept_eod) { + doComplexReport(cb, context, m, s, offset + length, 1, NULL, NULL); + } + + return s ? 
MO_ALIVE : MO_DEAD; +} + +char nfaExecMcClellan8_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + assert(n->type == MCCLELLAN_NFA_8); + const struct mcclellan *m = getImplNfa(n); + + if (m->flags & MCCLELLAN_FLAG_SINGLE) { + return nfaExecMcClellan8_Bi(n, offset, buffer, length, cb, context, 1); + } else { + return nfaExecMcClellan8_Bi(n, offset, buffer, length, cb, context, 0); + } +} + +char nfaExecMcClellan8_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCCLELLAN_NFA_8); + const struct mcclellan *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcClellan8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCCLELLAN_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcClellan16_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + assert(n->type == MCCLELLAN_NFA_16); + const struct mcclellan *m = getImplNfa(n); + + if (m->flags & MCCLELLAN_FLAG_SINGLE) { + return nfaExecMcClellan16_Bi(n, offset, buffer, length, cb, context, 1); + } else { + return nfaExecMcClellan16_Bi(n, offset, buffer, length, cb, context, 0); + } +} + +char nfaExecMcClellan16_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCCLELLAN_NFA_16); + const struct mcclellan *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcClellan16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCCLELLAN_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcClellan8_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcclellan *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u8 *)q->state; + u8 single = m->flags & MCCLELLAN_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + assert(s); + + if (s >= m->accept_limit_8) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +char nfaExecMcClellan16_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcclellan *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u16 *)q->state; + const struct mstate_aux *aux = get_aux(m, s); + u8 single = m->flags & MCCLELLAN_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + DEBUG_PRINTF("state %u\n", s); + assert(s); + + if (aux->accept) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +static +char mcclellanHasAccept(const struct mcclellan *m, const struct mstate_aux *aux, + ReportID report) { + assert(m && aux); + + if (!aux->accept) { + return 0; + } + + const struct report_list *rl = (const struct report_list *) + ((const char *)m + aux->accept - sizeof(struct NFA)); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", 
rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + return 1; + } + } + + return 0; +} + +char nfaExecMcClellan8_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcclellan *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + if (s < m->accept_limit_8) { + return 0; + } + + return mcclellanHasAccept(m, get_aux(m, s), report); +} + +char nfaExecMcClellan8_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcclellan *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + assert(s < m->accept_limit_8 || get_aux(m, s)->accept); + + return s >= m->accept_limit_8; +} + +char nfaExecMcClellan16_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcclellan *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return (m->has_wide == 1 && s >= m->wide_limit) ? + 0 : mcclellanHasAccept(m, get_aux(m, s), report); +} + +char nfaExecMcClellan16_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcclellan *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return (m->has_wide == 1 && s >= m->wide_limit) ? + 0 : !!get_aux(m, s)->accept; +} + +char nfaExecMcClellan8_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCCLELLAN_NFA_8); + const struct mcclellan *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcClellan8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCCLELLAN_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcClellan16_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCCLELLAN_NFA_16); + const struct mcclellan *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcClellan16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCCLELLAN_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcClellan8_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCCLELLAN_NFA_8); + const struct mcclellan *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcClellan8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCCLELLAN_FLAG_SINGLE, 0 /* end */, + NO_MATCHES); + if (rv && nfaExecMcClellan8_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecMcClellan16_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCCLELLAN_NFA_16); + const struct mcclellan *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcClellan16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCCLELLAN_FLAG_SINGLE, + 0 /* end */, NO_MATCHES); + + if (rv && nfaExecMcClellan16_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char 
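/*
 * For illustration only (not part of the patch): a minimal sketch of the
 * report-list walk done by mcclellanHasAccept() above.  Each accepting state
 * points at a count-prefixed, flexible-array list of report IDs, and the
 * inAccept checks are a linear membership scan over that list.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct report_list_sketch {
    uint32_t count;
    uint32_t report[];      /* flexible array member, as in report_list */
};

static int has_report(const struct report_list_sketch *rl, uint32_t report) {
    for (uint32_t i = 0; i < rl->count; i++) {
        if (rl->report[i] == report)
            return 1;
    }
    return 0;
}

int main(void) {
    uint32_t ids[] = {7, 11, 42};
    struct report_list_sketch *rl = malloc(sizeof(*rl) + sizeof(ids));
    if (!rl)
        return 1;
    rl->count = 3;
    memcpy(rl->report, ids, sizeof(ids));

    printf("%d %d\n", has_report(rl, 11), has_report(rl, 5));  /* 1 0 */
    free(rl);
    return 0;
}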
nfaExecMcClellan8_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcclellan *m = getImplNfa(nfa); + u8 s = offset ? m->start_floating : m->start_anchored; + if (s) { + *(u8 *)state = s; + return 1; + } + return 0; +} + +char nfaExecMcClellan16_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcclellan *m = getImplNfa(nfa); + u16 s = offset ? m->start_floating : m->start_anchored; + + // new byte + if (m->has_wide) { + unaligned_store_u16((u16 *)state + 1, 0); + } + + if (s) { + unaligned_store_u16(state, s); + return 1; + } + return 0; +} + +void nfaExecMcClellan8_SimpStream(const struct NFA *nfa, char *state, + const u8 *buf, char top, size_t start_off, + size_t len, NfaCallback cb, void *ctxt) { + const struct mcclellan *m = getImplNfa(nfa); + + u32 s = top ? m->start_anchored : *(u8 *)state; + + if (m->flags & MCCLELLAN_FLAG_SINGLE) { + mcclellanExec8_i(m, &s, buf + start_off, len - start_off, + start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT); + } else { + mcclellanExec8_i(m, &s, buf + start_off, len - start_off, + start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT); + } + + *(u8 *)state = s; +} + +void nfaExecMcClellan16_SimpStream(const struct NFA *nfa, char *state, + const u8 *buf, char top, size_t start_off, + size_t len, NfaCallback cb, void *ctxt) { + const struct mcclellan *m = getImplNfa(nfa); + u32 s; + + if (top) { + s = m->start_anchored; + + // new byte + if (m->has_wide) { + unaligned_store_u16((u16 *)state + 1, 0); + } + } else { + s = unaligned_load_u16(state); + } + + if (m->flags & MCCLELLAN_FLAG_SINGLE) { + mcclellanExec16_i(m, &s, state, buf + start_off, len - start_off, + start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT); + } else { + mcclellanExec16_i(m, &s, state, buf + start_off, len - start_off, + start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT); + } + + unaligned_store_u16(state, s); +} + +char nfaExecMcClellan8_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + return mcclellanCheckEOD(nfa, *(const u8 *)state, offset, callback, + context); +} + +char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(ISALIGNED_N(state, 2)); + return mcclellanCheckEOD(nfa, *(const u16 *)state, offset, callback, + context); +} + +char nfaExecMcClellan8_queueInitState(UNUSED const struct NFA *nfa, + struct mq *q) { + assert(nfa->scratchStateSize == 1); + *(u8 *)q->state = 0; + return 0; +} + +char nfaExecMcClellan16_queueInitState(UNUSED const struct NFA *nfa, + struct mq *q) { + const struct mcclellan *m = getImplNfa(nfa); + assert(m->has_wide == 1 ? 
nfa->scratchStateSize == 4 + : nfa->scratchStateSize == 2); + assert(ISALIGNED_N(q->state, 2)); + *(u16 *)q->state = 0; + + // new byte + if (m->has_wide) { + unaligned_store_u16((u16 *)q->state + 1, 0); + } + return 0; +} + +char nfaExecMcClellan8_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcClellan8_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcClellan16_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, + UNUSED s64a loc) { + const struct mcclellan *m = getImplNfa(nfa); + void *dest = q->streamState; + const void *src = q->state; + assert(m->has_wide == 1 ? nfa->scratchStateSize == 4 + : nfa->scratchStateSize == 2); + assert(m->has_wide == 1 ? nfa->streamStateSize == 4 + : nfa->streamStateSize == 2); + + assert(ISALIGNED_N(src, 2)); + unaligned_store_u16(dest, *(const u16 *)(src)); + + // new byte + if (m->has_wide) { + unaligned_store_u16((u16 *)dest + 1, *((const u16 *)src + 1)); + } + return 0; +} + +char nfaExecMcClellan16_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + const struct mcclellan *m = getImplNfa(nfa); + assert(m->has_wide == 1 ? nfa->scratchStateSize == 4 + : nfa->scratchStateSize == 2); + assert(m->has_wide == 1 ? nfa->streamStateSize == 4 + : nfa->streamStateSize == 2); + + assert(ISALIGNED_N(dest, 2)); + *(u16 *)dest = unaligned_load_u16(src); + + // new byte + if (m->has_wide) { + *((u16 *)dest + 1) = unaligned_load_u16((const u16 *)src + 1); + } + return 0; +} diff --git a/regex/nfa/mcclellan.h b/regex/nfa/mcclellan.h new file mode 100644 index 000000000..9c6b3eecb --- /dev/null +++ b/regex/nfa/mcclellan.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
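/*
 * For illustration only (not part of the patch): the stream-state helpers
 * above go through unaligned_load_u16()/unaligned_store_u16() because packed
 * stream state carries no alignment guarantee.  A portable sketch of that
 * idiom, using memcpy so the compiler emits a plain unaligned access where
 * the target allows it; names here are illustrative.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t load_u16_unaligned(const void *p) {
    uint16_t v;
    memcpy(&v, p, sizeof(v));
    return v;
}

static void store_u16_unaligned(void *p, uint16_t v) {
    memcpy(p, &v, sizeof(v));
}

int main(void) {
    uint8_t stream_state[5] = {0};                   /* odd-sized state blob */
    store_u16_unaligned(stream_state + 1, 0xBEEF);   /* misaligned offset    */
    printf("0x%04x\n", load_u16_unaligned(stream_state + 1));
    return 0;
}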
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MCCLELLAN_H +#define MCCLELLAN_H + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; + +// 8-bit McClellan + +char nfaExecMcClellan8_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcClellan8_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcClellan8_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcClellan8_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcClellan8_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcClellan8_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcClellan8_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcClellan8_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcClellan8_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcClellan8_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcClellan8_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecMcClellan8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcClellan8_zombie_status NFA_API_ZOMBIE_NO_IMPL + +// 16-bit McClellan + +char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcClellan16_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcClellan16_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcClellan16_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcClellan16_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcClellan16_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcClellan16_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcClellan16_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcClellan16_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcClellan16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcClellan16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecMcClellan16_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcClellan16_zombie_status NFA_API_ZOMBIE_NO_IMPL + +/** + * Simple streaming mode calls: + * - always uses the anchored start state regardless if top is set regardless of + * start_off + * - never checks eod + */ +void nfaExecMcClellan8_SimpStream(const struct NFA *nfa, char *state, + const u8 *buf, char top, size_t start_off, + size_t len, NfaCallback cb, void *ctxt); + +void nfaExecMcClellan16_SimpStream(const struct NFA *nfa, char *state, + const u8 *buf, char top, size_t start_off, + size_t len, NfaCallback cb, void *ctxt); + +/** + * Simple block mode calls: + * - always uses the anchored start state regardless of initial 
start + */ + +char nfaExecMcClellan8_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +char nfaExecMcClellan16_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +#endif diff --git a/regex/nfa/mcclellan_common_impl.h b/regex/nfa/mcclellan_common_impl.h new file mode 100644 index 000000000..7b0e7f48c --- /dev/null +++ b/regex/nfa/mcclellan_common_impl.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +enum MatchMode { + CALLBACK_OUTPUT, + STOP_AT_MATCH, + NO_MATCHES +}; + +static really_inline +const struct mstate_aux *get_aux(const struct mcclellan *m, u32 s) { + const char *nfa = (const char *)m - sizeof(struct NFA); + const struct mstate_aux *aux + = s + (const struct mstate_aux *)(nfa + m->aux_offset); + + assert(ISALIGNED(aux)); + return aux; +} + +static really_inline +u32 mcclellanEnableStarts(const struct mcclellan *m, u32 s) { + const struct mstate_aux *aux = get_aux(m, s); + + DEBUG_PRINTF("enabling starts %u->%hu\n", s, aux->top); + return aux->top; +} + +static really_inline +u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table, + u32 as) { + assert(ISALIGNED_N(sherman_state, 16)); + + u8 len = *(const u8 *)(sherman_state + SHERMAN_LEN_OFFSET); + + if (len) { + m128 ss_char = load128(sherman_state); + m128 cur_char = set16x8(cprime); + + u32 z = movemask128(eq128(ss_char, cur_char)); + + /* remove header cruft: type 1, len 1, daddy 2*/ + z &= ~0xf; + z &= (1U << (len + 4)) - 1; + + if (z) { + u32 i = ctz32(z & ~0xf) - 4; + + u32 s_out = unaligned_load_u16((const u8 *)sherman_state + + SHERMAN_STATES_OFFSET(len) + + sizeof(u16) * i); + DEBUG_PRINTF("found sherman match at %u/%u for c'=%hhu s=%u\n", i, + len, cprime, s_out); + return s_out; + } + } + + u32 daddy = *(const u16 *)(sherman_state + SHERMAN_DADDY_OFFSET); + return succ_table[(daddy << as) + cprime]; +} + +static really_inline +u16 doWide16(const char *wide_entry, const u8 **c_inout, const u8 *end, + const u8 *remap, const u16 *s, char *qstate, u16 *offset) { + // Internal relative offset after the last visit of the wide state. + if (qstate != NULL) { // stream mode + *offset = unaligned_load_u16((const u16 *)(qstate + 2)); + } + + u8 successful = 0; + const u8 *c = *c_inout; + u32 len_c = end - c; + + u16 width = *(const u16 *)(wide_entry + WIDE_WIDTH_OFFSET); + assert(width >= 8); + const u8 *symbols = (const u8 *)(wide_entry + WIDE_SYMBOL_OFFSET16); + const u16 *trans = (const u16 *)(wide_entry + + WIDE_TRANSITION_OFFSET16(width)); + + assert(*offset < width); + u16 len_w = width - *offset; + const u8 *sym = symbols + *offset; + + char tmp[16]; + u16 pos = 0; + + if (*offset == 0 && remap[*c] != *sym) { + goto normal; + } + + // both in (16, +oo). + while (len_w >= 16 && len_c >= 16) { + m128 str_w = loadu128(sym); + for (size_t i = 0; i < 16; i++) { + tmp[i] = remap[*(c + i)]; + } + m128 str_c = loadu128(tmp); + + u32 z = movemask128(eq128(str_w, str_c)); + pos = ctz32(~z); + assert(pos <= 16); + + if (pos < 16) { + goto normal; + } + + sym += 16; + c += 16; + len_w -= 16; + len_c -= 16; + } + + pos = 0; + // at least one in (0, 16). + u32 loadLength_w = MIN(len_w, 16); + u32 loadLength_c = MIN(len_c, 16); + m128 str_w = loadbytes128(sym, loadLength_w); + for (size_t i = 0; i < loadLength_c; i++) { + tmp[i] = remap[*(c + i)]; + } + m128 str_c = loadbytes128(tmp, loadLength_c); + + u32 z = movemask128(eq128(str_w, str_c)); + pos = ctz32(~z); + + pos = MIN(pos, MIN(loadLength_w, loadLength_c)); + + if (loadLength_w <= loadLength_c) { + assert(pos <= loadLength_w); + // successful matching. + if (pos == loadLength_w) { + c -= 1; + successful = 1; + } + // failure, do nothing. + } else { + assert(pos <= loadLength_c); + // successful partial matching. + if (pos == loadLength_c) { + c -= 1; + goto partial; + } + // failure, do nothing. + } + +normal: + *offset = 0; + if (qstate != NULL) { + // Internal relative offset. 
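/*
 * For illustration only (not part of the patch): a plain-C sketch of what
 * doSherman16() above does with SIMD.  A "sherman" state stores only a short
 * list of exceptional characters with their successors plus a "daddy" state;
 * any other character falls back to the daddy's ordinary row in the
 * successor table.  Layout and sizes here are simplified.
 */
#include <stdint.h>
#include <stdio.h>

struct sherman_sketch {
    uint8_t  len;          /* number of exceptional characters       */
    uint16_t daddy;        /* state whose full row we fall back to   */
    uint8_t  chars[4];     /* exceptional (already remapped) symbols */
    uint16_t succ[4];      /* their successors                       */
};

static uint32_t sherman_step(const struct sherman_sketch *sh, uint8_t cprime,
                             const uint16_t *succ_table,
                             uint32_t alpha_shift) {
    for (uint8_t i = 0; i < sh->len; i++) {
        if (sh->chars[i] == cprime)
            return sh->succ[i];                            /* exceptional edge */
    }
    return succ_table[(sh->daddy << alpha_shift) + cprime]; /* daddy's row */
}

int main(void) {
    /* daddy state 1 with a 4-symbol alphabet (alpha_shift = 2) */
    uint16_t succ_table[2 << 2] = { [4] = 5, [5] = 5, [6] = 5, [7] = 5 };
    struct sherman_sketch sh = { .len = 1, .daddy = 1,
                                 .chars = {2}, .succ = {9} };

    printf("%u %u\n",
           sherman_step(&sh, 2, succ_table, 2),   /* 9 (exceptional edge) */
           sherman_step(&sh, 0, succ_table, 2));  /* 5 (daddy row)        */
    return 0;
}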
+ unaligned_store_u16(qstate + 2, *offset); + } + c += pos; + *c_inout = c; + return successful ? *trans : *(trans + 1 + remap[*c]); + +partial: + *offset = sym - symbols + pos; + if (qstate != NULL) { + // Internal relative offset. + unaligned_store_u16(qstate + 2, *offset); + } + c += pos; + *c_inout = c; + return *s; +} diff --git a/regex/nfa/mcclellan_internal.h b/regex/nfa/mcclellan_internal.h new file mode 100644 index 000000000..482fdb1bc --- /dev/null +++ b/regex/nfa/mcclellan_internal.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef MCCLELLAN_INTERNAL_H +#define MCCLELLAN_INTERNAL_H + +#include "nfa_internal.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define ACCEPT_FLAG 0x8000 +#define ACCEL_FLAG 0x4000 +#define STATE_MASK 0x3fff + +#define SHERMAN_STATE 1 + +#define SHERMAN_TYPE_OFFSET 0 +#define SHERMAN_FIXED_SIZE 32 + +#define SHERMAN_LEN_OFFSET 1 +#define SHERMAN_DADDY_OFFSET 2 +#define SHERMAN_CHARS_OFFSET 4 +#define SHERMAN_STATES_OFFSET(sso_len) (4 + (sso_len)) + +#define WIDE_STATE 2 +#define WIDE_ENTRY_OFFSET8(weo_pos) (2 + (weo_pos)) +#define WIDE_ENTRY_OFFSET16(weo_pos) (4 + (weo_pos)) + +#define WIDE_WIDTH_OFFSET 0 +#define WIDE_SYMBOL_OFFSET8 1 +#define WIDE_TRANSITION_OFFSET8(wto_width) (1 + (wto_width)) +#define WIDE_SYMBOL_OFFSET16 2 +#define WIDE_TRANSITION_OFFSET16(wto_width) (2 + ROUNDUP_N(wto_width, 2)) + +struct report_list { + u32 count; + ReportID report[]; +}; + +struct mstate_aux { + u32 accept; + u32 accept_eod; + u16 top; + u32 accel_offset; /* relative to start of struct mcclellan; 0 if no accel */ +}; + +#define MCCLELLAN_FLAG_SINGLE 1 /**< we raise only single accept id */ + +struct mcclellan { + u16 state_count; /**< total number of states */ + u32 length; /**< length of dfa in bytes */ + u16 start_anchored; /**< anchored start state */ + u16 start_floating; /**< floating start state */ + u32 aux_offset; /**< offset of the aux structures relative to the start of + * the nfa structure */ + u32 sherman_offset; /**< offset of array of sherman state offsets the + * state_info structures relative to the start of the + * nfa structure */ + u32 sherman_end; /**< offset of the end of the state_info structures + * relative to the start of the nfa structure */ + u16 accel_limit_8; /**< 8 bit, lowest accelerable state */ + u16 accept_limit_8; /**< 8 bit, lowest accept state */ + u16 sherman_limit; /**< lowest sherman state */ + u16 wide_limit; /**< 8/16 bit, lowest wide head state */ + u8 alphaShift; + u8 flags; + u8 has_accel; /**< 1 iff there are any accel plans */ + u8 has_wide; /**< 1 iff there exists any wide state */ + u8 remap[256]; /**< remaps characters to a smaller alphabet */ + ReportID arb_report; /**< one of the accepts that this dfa may raise */ + u32 accel_offset; /**< offset of accel structures from start of McClellan */ + u32 haig_offset; /**< reserved for use by Haig, relative to start of NFA */ + u32 wide_offset; /**< offset of the wide state entries to the start of the + * nfa structure */ +}; + +static really_inline +const char *findShermanState(UNUSED const struct mcclellan *m, + const char *sherman_base_offset, u32 sherman_base, + u32 s) { + const char *rv + = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); + assert(rv < (const char *)m + m->length - sizeof(struct NFA)); + UNUSED u8 type = *(const u8 *)(rv + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + return rv; +} + +static really_inline +char *findMutableShermanState(char *sherman_base_offset, u16 sherman_base, + u32 s) { + return sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); +} + +static really_inline +const char *findWideEntry8(UNUSED const struct mcclellan *m, + const char *wide_base, u32 wide_limit, u32 s) { + UNUSED u8 type = *(const u8 *)wide_base; + assert(type == WIDE_STATE); + const u32 entry_offset + = *(const u32 *)(wide_base + + WIDE_ENTRY_OFFSET8((s - wide_limit) * sizeof(u32))); + + const char *rv = wide_base + entry_offset; + assert(rv < (const char *)m + m->length - sizeof(struct NFA)); + return rv; +} + +static really_inline +const char 
*findWideEntry16(UNUSED const struct mcclellan *m, + const char *wide_base, u32 wide_limit, u32 s) { + UNUSED u8 type = *(const u8 *)wide_base; + assert(type == WIDE_STATE); + const u32 entry_offset + = *(const u32 *)(wide_base + + WIDE_ENTRY_OFFSET16((s - wide_limit) * sizeof(u32))); + + const char *rv = wide_base + entry_offset; + assert(rv < (const char *)m + m->length - sizeof(struct NFA)); + return rv; +} + +static really_inline +char *findMutableWideEntry16(char *wide_base, u32 wide_limit, u32 s) { + u32 entry_offset + = *(const u32 *)(wide_base + + WIDE_ENTRY_OFFSET16((s - wide_limit) * sizeof(u32))); + + return wide_base + entry_offset; +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/nfa/mcsheng.c b/regex/nfa/mcsheng.c new file mode 100644 index 000000000..f86acedf5 --- /dev/null +++ b/regex/nfa/mcsheng.c @@ -0,0 +1,2742 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "mcsheng.h" + +#include "accel.h" +#include "mcsheng_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/compare.h" +#include "util/simd_utils.h" +#include "ue2common.h" + +enum MatchMode { + CALLBACK_OUTPUT, + STOP_AT_MATCH, + NO_MATCHES +}; + +static really_inline +const struct mstate_aux *get_aux(const struct mcsheng *m, u32 s) { + const char *nfa = (const char *)m - sizeof(struct NFA); + const struct mstate_aux *aux + = s + (const struct mstate_aux *)(nfa + m->aux_offset); + + assert(ISALIGNED(aux)); + return aux; +} + +static really_inline +u32 mcshengEnableStarts(const struct mcsheng *m, u32 s) { + const struct mstate_aux *aux = get_aux(m, s); + + DEBUG_PRINTF("enabling starts %u->%hu\n", s, aux->top); + return aux->top; +} + +static really_inline +u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table, + u32 as) { + assert(ISALIGNED_N(sherman_state, 16)); + + u8 len = *(const u8 *)(sherman_state + SHERMAN_LEN_OFFSET); + + if (len) { + m128 ss_char = load128(sherman_state); + m128 cur_char = set16x8(cprime); + + u32 z = movemask128(eq128(ss_char, cur_char)); + + /* remove header cruft: type 1, len 1, daddy 2*/ + z &= ~0xf; + z &= (1U << (len + 4)) - 1; + + if (z) { + u32 i = ctz32(z & ~0xf) - 4; + + u32 s_out = unaligned_load_u16((const u8 *)sherman_state + + SHERMAN_STATES_OFFSET(len) + + sizeof(u16) * i); + DEBUG_PRINTF("found sherman match at %u/%u for c'=%hhu s=%u\n", i, + len, cprime, s_out); + return s_out; + } + } + + u32 daddy = *(const u16 *)(sherman_state + SHERMAN_DADDY_OFFSET); + return succ_table[(daddy << as) + cprime]; +} + +static really_inline +char doComplexReport(NfaCallback cb, void *ctxt, const struct mcsheng *m, + u32 s, u64a loc, char eod, u32 *cached_accept_state, + u32 *cached_accept_id) { + DEBUG_PRINTF("reporting state = %u, loc=%llu, eod %hhu\n", + s & STATE_MASK, loc, eod); + + if (!eod && s == *cached_accept_state) { + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + const struct mstate_aux *aux = get_aux(m, s); + size_t offset = eod ? 
aux->accept_eod : aux->accept; + + assert(offset); + const struct report_list *rl + = (const void *)((const char *)m + offset - sizeof(struct NFA)); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list size %u\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = s; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +#define SHENG_CHUNK 8 + +static really_inline +u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, + const u8 *hard_c_end, u32 s_in, char do_accel) { + assert(s_in < m->sheng_end); + assert(s_in); /* should not already be dead */ + assert(soft_c_end <= hard_c_end); + DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1); + m128 s = set16x8(s_in - 1); + const u8 *c = *c_inout; + const u8 *c_end = hard_c_end - SHENG_CHUNK + 1; + if (!do_accel) { + c_end = MIN(soft_c_end, hard_c_end - SHENG_CHUNK + 1); + } + const m128 *masks = m->sheng_masks; + u8 sheng_limit = m->sheng_end - 1; /* - 1: no dead state */ + u8 sheng_stop_limit = do_accel ? m->sheng_accel_limit : sheng_limit; + + /* When we use movd to get a u32 containing our state, it will have 4 lanes + * all duplicating the state. We can create versions of our limits with 4 + * copies to directly compare against, this prevents us generating code to + * extract a single copy of the state from the u32 for checking. 
*/ + u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101; + +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + u32 sheng_limit_x4 = sheng_limit * 0x01010101; + m128 simd_stop_limit = set4x32(sheng_stop_limit_x4); + m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit); + DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit, + m->sheng_accel_limit, sheng_stop_limit); +#endif + +#define SHENG_SINGLE_ITER do { \ + m128 shuffle_mask = masks[*(c++)]; \ + s = pshufb_m128(shuffle_mask, s); \ + u32 s_gpr_x4 = movd(s); /* convert to u8 */ \ + DEBUG_PRINTF("c %hhu (%c) --> s %u\n", c[-1], c[-1], s_gpr_x4); \ + if (s_gpr_x4 >= sheng_stop_limit_x4) { \ + s_gpr = s_gpr_x4; \ + goto exit; \ + } \ + } while (0) + + u8 s_gpr; + while (c < c_end) { +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + /* This version uses pext for efficiently bitbashing out scaled + * versions of the bytes to process from a u64a */ + + u64a data_bytes = unaligned_load_u64a(c); + u64a cc0 = pdep64(data_bytes, 0xff0); /* extract scaled low byte */ + data_bytes &= ~0xffULL; /* clear low bits for scale space */ + m128 shuffle_mask0 = load128((const char *)masks + cc0); + s = pshufb_m128(shuffle_mask0, s); + m128 s_max = s; + m128 s_max0 = s_max; + DEBUG_PRINTF("c %02llx --> s %u\n", cc0 >> 4, movd(s)); + +#define SHENG_SINGLE_UNROLL_ITER(iter) \ + assert(iter); \ + u64a cc##iter = pext64(data_bytes, mcsheng_pext_mask[iter]); \ + assert(cc##iter == (u64a)c[iter] << 4); \ + m128 shuffle_mask##iter = load128((const char *)masks + cc##iter); \ + s = pshufb_m128(shuffle_mask##iter, s); \ + if (do_accel && iter == 7) { \ + /* in the final iteration we also have to check against accel */ \ + m128 s_temp = sadd_u8_m128(s, accel_delta); \ + s_max = max_u8_m128(s_max, s_temp); \ + } else { \ + s_max = max_u8_m128(s_max, s); \ + } \ + m128 s_max##iter = s_max; \ + DEBUG_PRINTF("c %02llx --> s %u max %u\n", cc##iter >> 4, \ + movd(s), movd(s_max)); + + SHENG_SINGLE_UNROLL_ITER(1); + + SHENG_SINGLE_UNROLL_ITER(2); + SHENG_SINGLE_UNROLL_ITER(3); + + SHENG_SINGLE_UNROLL_ITER(4); + SHENG_SINGLE_UNROLL_ITER(5); + + SHENG_SINGLE_UNROLL_ITER(6); + SHENG_SINGLE_UNROLL_ITER(7); + + if (movd(s_max7) >= sheng_limit_x4) { + DEBUG_PRINTF("exit found\n"); + + /* Explicitly check the last byte as it is more likely as it also + * checks for acceleration. 
*/ + if (movd(s_max6) < sheng_limit_x4) { + c += SHENG_CHUNK; + s_gpr = movq(s); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } + + /* use shift-xor to create a register containing all of the max + * values */ + m128 blended = rshift64_m128(s_max0, 56); + blended = xor128(blended, rshift64_m128(s_max1, 48)); + blended = xor128(blended, rshift64_m128(s_max2, 40)); + blended = xor128(blended, rshift64_m128(s_max3, 32)); + blended = xor128(blended, rshift64_m128(s_max4, 24)); + blended = xor128(blended, rshift64_m128(s_max5, 16)); + blended = xor128(blended, rshift64_m128(s_max6, 8)); + blended = xor128(blended, s); + blended = xor128(blended, rshift64_m128(blended, 8)); + DEBUG_PRINTF("blended %016llx\n", movq(blended)); + + m128 final = min_u8_m128(blended, simd_stop_limit); + m128 cmp = sub_u8_m128(final, simd_stop_limit); + u64a stops = ~movemask128(cmp); + assert(stops); + u32 earliest = ctz32(stops); + DEBUG_PRINTF("stops %02llx, earliest %u\n", stops, earliest); + assert(earliest < 8); + c += earliest + 1; + s_gpr = movq(blended) >> (earliest * 8); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } else { + c += SHENG_CHUNK; + } +#else + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; +#endif + } + + assert(c_end - c < SHENG_CHUNK); + if (c < soft_c_end) { + assert(soft_c_end - c < SHENG_CHUNK); + switch (soft_c_end - c) { + case 7: + SHENG_SINGLE_ITER; + FALLTHROUGH; + case 6: + SHENG_SINGLE_ITER; + FALLTHROUGH; + case 5: + SHENG_SINGLE_ITER; + FALLTHROUGH; + case 4: + SHENG_SINGLE_ITER; + FALLTHROUGH; + case 3: + SHENG_SINGLE_ITER; + FALLTHROUGH; + case 2: + SHENG_SINGLE_ITER; + FALLTHROUGH; + case 1: + SHENG_SINGLE_ITER; + } + } + + assert(c >= soft_c_end); + + s_gpr = movd(s); +exit: + assert(c <= hard_c_end); + DEBUG_PRINTF("%zu from end; s %hhu\n", c_end - c, s_gpr); + assert(c >= soft_c_end || s_gpr >= sheng_stop_limit); + /* undo state adjustment to match mcclellan view */ + if (s_gpr == sheng_limit) { + s_gpr = 0; + } else if (s_gpr < sheng_limit) { + s_gpr++; + } + + *c_inout = c; + return s_gpr; +} + +static really_inline +const char *findShermanState(UNUSED const struct mcsheng *m, + const char *sherman_base_offset, u32 sherman_base, + u32 s) { + const char *rv + = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); + assert(rv < (const char *)m + m->length - sizeof(struct NFA)); + UNUSED u8 type = *(const u8 *)(rv + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + return rv; +} + +static really_inline +const u8 *run_mcsheng_accel(const struct mcsheng *m, + const struct mstate_aux *aux, u32 s, + const u8 **min_accel_offset, + const u8 *c, const u8 *c_end) { + DEBUG_PRINTF("skipping\n"); + u32 accel_offset = aux[s].accel_offset; + + assert(aux[s].accel_offset); + assert(accel_offset >= m->aux_offset); + assert(!m->sherman_offset || accel_offset < m->sherman_offset); + + const union AccelAux *aaux = (const void *)((const char *)m + accel_offset); + const u8 *c2 = run_accel(aaux, c, c_end); + + if (c2 < *min_accel_offset + BAD_ACCEL_DIST) { + *min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + *min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (*min_accel_offset >= c_end - ACCEL_MIN_LEN) { + *min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, *min_accel_offset - c2, c_end - c2); + + return c2; +} + +static really_inline +u32 doNormal16(const struct mcsheng *m, const 
u8 **c_inout, const u8 *end, + u32 s, char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + + const u16 *succ_table + = (const u16 *)((const char *)m + sizeof(struct mcsheng)); + assert(ISALIGNED_N(succ_table, 2)); + u32 sheng_end = m->sheng_end; + u32 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)m - sizeof(struct NFA) + m->sherman_offset; + u32 as = m->alphaShift; + + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + + s &= STATE_MASK; + + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u)\n", *c, + ourisprint(*c) ? *c : '?', cprime, s); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[(s << as) + cprime]; + } else { + const char *sherman_state + = findShermanState(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman (%u)\n", s); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + + DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK); + c++; + + if (do_accel && (s & ACCEL_FLAG)) { + break; + } + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + break; + } + + s &= STATE_MASK; + } + + *c_inout = c; + return s; +} + +static really_inline +char mcshengExec16_i(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + assert(ISALIGNED_N(state, 2)); + if (!len) { + if (mode == STOP_AT_MATCH) { + *c_final = buf; + } + return MO_ALIVE; + } + + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + + s &= STATE_MASK; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + int do_accept; + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng(m, &c, min_accel_offset, c_end, s, 0); + do_accept = mode != NO_MATCHES && get_aux(m, s)->accept; + } else { + s = doNormal16(m, &c, min_accel_offset, s, 0, mode); + + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + assert(c < c_end); + int do_accept; + + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) 
{ + c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng(m, &c, c_end, c_end, s, 1); + do_accept = mode != NO_MATCHES && get_aux(m, s)->accept; + } else { + if (s & ACCEL_FLAG) { + DEBUG_PRINTF("skipping\n"); + s &= STATE_MASK; + c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + + s = doNormal16(m, &c, c_end, s, 1, mode); + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + s &= STATE_MASK; + + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + *state = s; + + return MO_ALIVE; +} + +static never_inline +char mcshengExec16_i_cb(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcshengExec16_i_sam(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcshengExec16_i_nm(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcshengExec16_i_ni(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcshengExec16_i_cb(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else if (mode == STOP_AT_MATCH) { + return mcshengExec16_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert (mode == NO_MATCHES); + return mcshengExec16_i_nm(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } +} + +static really_inline +u32 doNormal8(const struct mcsheng *m, const u8 **c_inout, const u8 *end, u32 s, + char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + u32 sheng_end = m->sheng_end; + u32 accel_limit = m->accel_limit_8; + u32 accept_limit = m->accept_limit_8; + + const u32 as = m->alphaShift; + const u8 *succ_table = (const u8 *)((const char *)m + + sizeof(struct mcsheng)); + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. 
Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + + assert(s >= sheng_end); + + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *c, + ourisprint(*c) ? *c : '?', cprime); + s = succ_table[(s << as) + cprime]; + + DEBUG_PRINTF("s: %u\n", s); + c++; + if (do_accel) { + if (s >= accel_limit) { + break; + } + } else { + if (mode != NO_MATCHES && s >= accept_limit) { + break; + } + } + } + *c_inout = c; + return s; +} + +static really_inline +char mcshengExec8_i(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + if (!len) { + *c_final = buf; + return MO_ALIVE; + } + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + u32 accept_limit = m->accept_limit_8; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit); + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng(m, &c, min_accel_offset, c_end, s, 0); + } else { + s = doNormal8(m, &c, min_accel_offset, s, 0, mode); + assert(c <= min_accel_offset); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport(cb, ctxt, m, s, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + u32 accel_limit = m->accel_limit_8; + + assert(c < c_end); + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) { + c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng(m, &c, c_end, c_end, s, 1); + } else { + if (s >= accel_limit && aux[s].accel_offset) { + c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doNormal8(m, &c, c_end, s, 1, mode); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport(cb, ctxt, m, s, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: 
+ *state = s; + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + return MO_ALIVE; +} + +static never_inline +char mcshengExec8_i_cb(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcshengExec8_i_sam(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcshengExec8_i_nm(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcshengExec8_i_ni(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcshengExec8_i_cb(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } else if (mode == STOP_AT_MATCH) { + return mcshengExec8_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert(mode == NO_MATCHES); + return mcshengExec8_i_nm(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } +} + +static really_inline +char mcshengCheckEOD(const struct NFA *nfa, u32 s, u64a offset, + NfaCallback cb, void *ctxt) { + const struct mcsheng *m = getImplNfa(nfa); + const struct mstate_aux *aux = get_aux(m, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + return doComplexReport(cb, ctxt, m, s, offset, 1, NULL, NULL); +} + +static really_inline +char nfaExecMcSheng16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_NFA_16); + const struct mcsheng *m = getImplNfa(n); + s64a sp; + + assert(ISALIGNED_N(q->state, 2)); + u32 s = *(u16 *)q->state; + + if (q->report_current) { + assert(s); + assert(get_aux(m, s)->accept); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
hend : buffer; + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + /* do main buffer region */ + const u8 *final_look; + char rv = mcshengExec16_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_DEAD) { + *(u16 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u16 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = m->start_anchored; + break; + } + s = mcshengEnableStarts(m, s); + break; + case MQE_END: + *(u16 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +static really_inline +char nfaExecMcSheng8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_NFA_8); + const struct mcsheng *m = getImplNfa(n); + s64a sp; + + u32 s = *(u8 *)q->state; + + if (q->report_current) { + assert(s); + assert(s >= m->accept_limit_8); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? hend : buffer; + + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" : + q->items[q->cur].type == MQE_END ? 
"END" : "???", + q->items[q->cur].location + offset); + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + const u8 *final_look; + char rv = mcshengExec8_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_HALT_MATCHING) { + *(u8 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u8 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = (u8)m->start_anchored; + break; + } + s = mcshengEnableStarts(m, s); + break; + case MQE_END: + *(u8 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +char nfaExecMcSheng8_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_8); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng16_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_16); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng8_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u8 *)q->state; + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + assert(s); + + if (s >= m->accept_limit_8) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +char nfaExecMcSheng16_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u16 *)q->state; + const struct mstate_aux *aux = get_aux(m, s); + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + 
assert(q_cur_type(q) == MQE_START); + DEBUG_PRINTF("state %u\n", s); + assert(s); + + if (aux->accept) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +static +char mcshengHasAccept(const struct mcsheng *m, const struct mstate_aux *aux, + ReportID report) { + assert(m && aux); + + if (!aux->accept) { + return 0; + } + + const struct report_list *rl = (const struct report_list *) + ((const char *)m + aux->accept - sizeof(struct NFA)); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + return 1; + } + } + + return 0; +} + +char nfaExecMcSheng8_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return mcshengHasAccept(m, get_aux(m, s), report); +} + +char nfaExecMcSheng8_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return !!get_aux(m, s)->accept; +} + +char nfaExecMcSheng16_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return mcshengHasAccept(m, get_aux(m, s), report); +} + +char nfaExecMcSheng16_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return !!get_aux(m, s)->accept; +} + +char nfaExecMcSheng8_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_8); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng16_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_16); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng8_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_8); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, 0 /* end */, + NO_MATCHES); + if (rv && nfaExecMcSheng8_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecMcSheng16_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = 
q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_16); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, 0 /* end */, + NO_MATCHES); + + if (rv && nfaExecMcSheng16_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecMcSheng8_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng *m = getImplNfa(nfa); + u8 s = offset ? m->start_floating : m->start_anchored; + if (s) { + *(u8 *)state = s; + return 1; + } + return 0; +} + +char nfaExecMcSheng16_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng *m = getImplNfa(nfa); + u16 s = offset ? m->start_floating : m->start_anchored; + if (s) { + unaligned_store_u16(state, s); + return 1; + } + return 0; +} + +char nfaExecMcSheng8_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + return mcshengCheckEOD(nfa, *(const u8 *)state, offset, callback, + context); +} + +char nfaExecMcSheng16_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(ISALIGNED_N(state, 2)); + return mcshengCheckEOD(nfa, *(const u16 *)state, offset, callback, + context); +} + +char nfaExecMcSheng8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + *(u8 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng16_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 2); + assert(ISALIGNED_N(q->state, 2)); + *(u16 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng8_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng8_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng16_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, + UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(src, 2)); + unaligned_store_u16(dest, *(const u16 *)(src)); + return 0; +} + +char nfaExecMcSheng16_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(dest, 2)); + *(u16 *)dest = unaligned_load_u16(src); + return 0; +} + +#if defined(HAVE_AVX512VBMI) +static really_inline +const struct mstate_aux *get_aux64(const struct mcsheng64 *m, u32 s) { + const char *nfa = (const char *)m - sizeof(struct NFA); + const struct mstate_aux *aux + = s + (const struct mstate_aux *)(nfa + m->aux_offset); + + assert(ISALIGNED(aux)); + return aux; +} + +static really_inline +u32 mcshengEnableStarts64(const struct mcsheng64 *m, u32 s) { + const struct mstate_aux *aux = 
get_aux64(m, s); + + DEBUG_PRINTF("enabling starts %u->%hu\n", s, aux->top); + return aux->top; +} + +static really_inline +char doComplexReport64(NfaCallback cb, void *ctxt, const struct mcsheng64 *m, + u32 s, u64a loc, char eod, u32 *cached_accept_state, + u32 *cached_accept_id) { + DEBUG_PRINTF("reporting state = %u, loc=%llu, eod %hhu\n", + s & STATE_MASK, loc, eod); + + if (!eod && s == *cached_accept_state) { + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + const struct mstate_aux *aux = get_aux64(m, s); + size_t offset = eod ? aux->accept_eod : aux->accept; + + assert(offset); + const struct report_list *rl + = (const void *)((const char *)m + offset - sizeof(struct NFA)); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list size %u\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = s; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +static really_inline +u32 doSheng64(const struct mcsheng64 *m, const u8 **c_inout, const u8 *soft_c_end, + const u8 *hard_c_end, u32 s_in, char do_accel) { + assert(s_in < m->sheng_end); + assert(s_in); /* should not already be dead */ + assert(soft_c_end <= hard_c_end); + DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1); + m512 s = set64x8(s_in - 1); + const u8 *c = *c_inout; + const u8 *c_end = hard_c_end - SHENG_CHUNK + 1; + if (!do_accel) { + c_end = MIN(soft_c_end, hard_c_end - SHENG_CHUNK + 1); + } + + const m512 *masks = m->sheng_succ_masks; + u8 sheng_limit = m->sheng_end - 1; /* - 1: no dead state */ + u8 sheng_stop_limit = do_accel ? m->sheng_accel_limit : sheng_limit; + + /* When we use movd to get a u32 containing our state, it will have 4 lanes + * all duplicating the state. We can create versions of our limits with 4 + * copies to directly compare against, this prevents us generating code to + * extract a single copy of the state from the u32 for checking. 
*/ + u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101; + +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + u32 sheng_limit_x4 = sheng_limit * 0x01010101; + m512 simd_stop_limit = set16x32(sheng_stop_limit_x4); + m512 accel_delta = set64x8(sheng_limit - sheng_stop_limit); + DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit, + m->sheng_accel_limit, sheng_stop_limit); +#endif + +#define SHENG64_SINGLE_ITER do { \ + m512 succ_mask = masks[*(c++)]; \ + s = vpermb512(s, succ_mask); \ + u32 s_gpr_x4 = movd512(s); /* convert to u8 */ \ + DEBUG_PRINTF("c %hhu (%c) --> s %u\n", c[-1], c[-1], s_gpr_x4); \ + if (s_gpr_x4 >= sheng_stop_limit_x4) { \ + s_gpr = s_gpr_x4; \ + goto exit; \ + } \ + } while (0) + + u8 s_gpr; + while (c < c_end) { +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + /* This version uses pext for efficiently bitbashing out scaled + * versions of the bytes to process from a u64a */ + + u64a data_bytes = unaligned_load_u64a(c); + u64a cc0 = pdep64(data_bytes, 0x3fc0); /* extract scaled low byte */ + data_bytes &= ~0xffULL; /* clear low bits for scale space */ + + m512 succ_mask0 = load512((const char *)masks + cc0); + s = vpermb512(s, succ_mask0); + m512 s_max = s; + m512 s_max0 = s_max; + DEBUG_PRINTF("c %02llx --> s %u\n", cc0 >> 6, movd512(s)); + +#define SHENG64_SINGLE_UNROLL_ITER(iter) \ + assert(iter); \ + u64a cc##iter = pext64(data_bytes, mcsheng64_pext_mask[iter]); \ + assert(cc##iter == (u64a)c[iter] << 6); \ + m512 succ_mask##iter = load512((const char *)masks + cc##iter); \ + s = vpermb512(s, succ_mask##iter); \ + if (do_accel && iter == 7) { \ + /* in the final iteration we also have to check against accel */ \ + m512 s_temp = sadd_u8_m512(s, accel_delta); \ + s_max = max_u8_m512(s_max, s_temp); \ + } else { \ + s_max = max_u8_m512(s_max, s); \ + } \ + m512 s_max##iter = s_max; \ + DEBUG_PRINTF("c %02llx --> s %u max %u\n", cc##iter >> 6, \ + movd512(s), movd512(s_max)); + + SHENG64_SINGLE_UNROLL_ITER(1); + SHENG64_SINGLE_UNROLL_ITER(2); + SHENG64_SINGLE_UNROLL_ITER(3); + SHENG64_SINGLE_UNROLL_ITER(4); + SHENG64_SINGLE_UNROLL_ITER(5); + SHENG64_SINGLE_UNROLL_ITER(6); + SHENG64_SINGLE_UNROLL_ITER(7); + + if (movd512(s_max7) >= sheng_limit_x4) { + DEBUG_PRINTF("exit found\n"); + + /* Explicitly check the last byte as it is more likely as it also + * checks for acceleration. 
*/ + if (movd512(s_max6) < sheng_limit_x4) { + c += SHENG_CHUNK; + s_gpr = movq512(s); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } + + /* use shift-xor to create a register containing all of the max + * values */ + m512 blended = rshift64_m512(s_max0, 56); + blended = xor512(blended, rshift64_m512(s_max1, 48)); + blended = xor512(blended, rshift64_m512(s_max2, 40)); + blended = xor512(blended, rshift64_m512(s_max3, 32)); + blended = xor512(blended, rshift64_m512(s_max4, 24)); + blended = xor512(blended, rshift64_m512(s_max5, 16)); + blended = xor512(blended, rshift64_m512(s_max6, 8)); + blended = xor512(blended, s); + blended = xor512(blended, rshift64_m512(blended, 8)); + DEBUG_PRINTF("blended %016llx\n", movq512(blended)); + + m512 final = min_u8_m512(blended, simd_stop_limit); + m512 cmp = sub_u8_m512(final, simd_stop_limit); + m128 tmp = cast512to128(cmp); + u64a stops = ~movemask128(tmp); + assert(stops); + u32 earliest = ctz32(stops); + DEBUG_PRINTF("stops %02llx, earliest %u\n", stops, earliest); + assert(earliest < 8); + c += earliest + 1; + s_gpr = movq512(blended) >> (earliest * 8); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } else { + c += SHENG_CHUNK; + } +#else + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; +#endif + } + + assert(c_end - c < SHENG_CHUNK); + if (c < soft_c_end) { + assert(soft_c_end - c < SHENG_CHUNK); + switch (soft_c_end - c) { + case 7: + SHENG64_SINGLE_ITER; // fallthrough + case 6: + SHENG64_SINGLE_ITER; // fallthrough + case 5: + SHENG64_SINGLE_ITER; // fallthrough + case 4: + SHENG64_SINGLE_ITER; // fallthrough + case 3: + SHENG64_SINGLE_ITER; // fallthrough + case 2: + SHENG64_SINGLE_ITER; // fallthrough + case 1: + SHENG64_SINGLE_ITER; // fallthrough + } + } + + assert(c >= soft_c_end); + + s_gpr = movq512(s); +exit: + assert(c <= hard_c_end); + DEBUG_PRINTF("%zu from end; s %hhu\n", c_end - c, s_gpr); + assert(c >= soft_c_end || s_gpr >= sheng_stop_limit); + /* undo state adjustment to match mcclellan view */ + if (s_gpr == sheng_limit) { + s_gpr = 0; + } else if (s_gpr < sheng_limit) { + s_gpr++; + } + + *c_inout = c; + return s_gpr; +} + +static really_inline +const char *findShermanState64(UNUSED const struct mcsheng64 *m, + const char *sherman_base_offset, + u32 sherman_base, u32 s) { + const char *rv + = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); + assert(rv < (const char *)m + m->length - sizeof(struct NFA)); + UNUSED u8 type = *(const u8 *)(rv + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + return rv; +} + +static really_inline +const u8 *run_mcsheng_accel64(const struct mcsheng64 *m, + const struct mstate_aux *aux, u32 s, + const u8 **min_accel_offset, + const u8 *c, const u8 *c_end) { + DEBUG_PRINTF("skipping\n"); + u32 accel_offset = aux[s].accel_offset; + + assert(aux[s].accel_offset); + assert(accel_offset >= m->aux_offset); + assert(!m->sherman_offset || accel_offset < m->sherman_offset); + + const union AccelAux *aaux = (const void *)((const char *)m + accel_offset); + const u8 *c2 = run_accel(aaux, c, c_end); + + if (c2 < *min_accel_offset + BAD_ACCEL_DIST) { + *min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + *min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (*min_accel_offset >= c_end - ACCEL_MIN_LEN) { + *min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, *min_accel_offset - c2, 
c_end - c2); + + return c2; +} + +static really_inline +u32 doNormal64_16(const struct mcsheng64 *m, const u8 **c_inout, const u8 *end, + u32 s, char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + const u16 *succ_table + = (const u16 *)((const char *)m + sizeof(struct mcsheng64)); + assert(ISALIGNED_N(succ_table, 2)); + u32 sheng_end = m->sheng_end; + u32 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)m - sizeof(struct NFA) + m->sherman_offset; + u32 as = m->alphaShift; + + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + s &= STATE_MASK; + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u)\n", *c, + ourisprint(*c) ? *c : '?', cprime, s); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[(s << as) + cprime]; + } else { + const char *sherman_state + = findShermanState64(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman (%u)\n", s); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + + DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK); + c++; + + if (do_accel && (s & ACCEL_FLAG)) { + break; + } + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + break; + } + + s &= STATE_MASK; + } + + *c_inout = c; + return s; +} + +static really_inline +char mcsheng64Exec16_i(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + assert(ISALIGNED_N(state, 2)); + if (!len) { + if (mode == STOP_AT_MATCH) { + *c_final = buf; + } + return MO_ALIVE; + } + + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + + s &= STATE_MASK; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + int do_accept; + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng64(m, &c, min_accel_offset, c_end, s, 0); + do_accept = mode != NO_MATCHES && get_aux64(m, s)->accept; + } else { + s = doNormal64_16(m, &c, min_accel_offset, s, 0, mode); + + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport64(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + 
assert(c < c_end); + int do_accept; + + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng64(m, &c, c_end, c_end, s, 1); + do_accept = mode != NO_MATCHES && get_aux64(m, s)->accept; + } else { + if (s & ACCEL_FLAG) { + DEBUG_PRINTF("skipping\n"); + s &= STATE_MASK; + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + + s = doNormal64_16(m, &c, c_end, s, 1, mode); + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport64(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + s &= STATE_MASK; + + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + *state = s; + + return MO_ALIVE; +} + +static never_inline +char mcsheng64Exec16_i_cb(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcsheng64Exec16_i_sam(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcsheng64Exec16_i_nm(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcsheng64Exec16_i_ni(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcsheng64Exec16_i_cb(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else if (mode == STOP_AT_MATCH) { + return mcsheng64Exec16_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert (mode == NO_MATCHES); + return mcsheng64Exec16_i_nm(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } +} + +static really_inline +u32 doNormal64_8(const struct mcsheng64 *m, const u8 **c_inout, const u8 *end, u32 s, + char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + u32 sheng_end = m->sheng_end; + u32 accel_limit = m->accel_limit_8; + u32 accept_limit = m->accept_limit_8; + + const u32 as = m->alphaShift; + const u8 *succ_table = (const u8 *)((const char *)m + + sizeof(struct mcsheng64)); + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). 
As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + + assert(s >= sheng_end); + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *c, + ourisprint(*c) ? *c : '?', cprime); + s = succ_table[(s << as) + cprime]; + + DEBUG_PRINTF("s: %u\n", s); + c++; + if (do_accel) { + if (s >= accel_limit) { + break; + } + } else { + if (mode != NO_MATCHES && s >= accept_limit) { + break; + } + } + } + *c_inout = c; + return s; +} + +static really_inline +char mcsheng64Exec8_i(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + if (!len) { + *c_final = buf; + return MO_ALIVE; + } + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + u32 accept_limit = m->accept_limit_8; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit); + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng64(m, &c, min_accel_offset, c_end, s, 0); + } else { + s = doNormal64_8(m, &c, min_accel_offset, s, 0, mode); + assert(c <= min_accel_offset); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport64(cb, ctxt, m, s, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + u32 accel_limit = m->accel_limit_8; + + assert(c < c_end); + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng64(m, &c, c_end, c_end, s, 1); + } else { + if (s >= accel_limit && aux[s].accel_offset) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doNormal64_8(m, &c, c_end, s, 1, mode); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport64(cb, ctxt, m, s, loc, 0, + 
&cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + *state = s; + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + return MO_ALIVE; +} + +static never_inline +char mcsheng64Exec8_i_cb(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcsheng64Exec8_i_sam(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcsheng64Exec8_i_nm(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcsheng64Exec8_i_ni(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcsheng64Exec8_i_cb(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } else if (mode == STOP_AT_MATCH) { + return mcsheng64Exec8_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert(mode == NO_MATCHES); + return mcsheng64Exec8_i_nm(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } +} + +static really_inline +char mcshengCheckEOD64(const struct NFA *nfa, u32 s, u64a offset, + NfaCallback cb, void *ctxt) { + const struct mcsheng64 *m = getImplNfa(nfa); + const struct mstate_aux *aux = get_aux64(m, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + return doComplexReport64(cb, ctxt, m, s, offset, 1, NULL, NULL); +} + +static really_inline +char nfaExecMcSheng64_16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + s64a sp; + + assert(ISALIGNED_N(q->state, 2)); + u32 s = *(u16 *)q->state; + + if (q->report_current) { + assert(s); + assert(get_aux64(m, s)->accept); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport64(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
hend : buffer; + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + /* do main buffer region */ + const u8 *final_look; + char rv = mcsheng64Exec16_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_DEAD) { + *(u16 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u16 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = m->start_anchored; + break; + } + s = mcshengEnableStarts64(m, s); + break; + case MQE_END: + *(u16 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +static really_inline +char nfaExecMcSheng64_8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + s64a sp; + + u32 s = *(u8 *)q->state; + + if (q->report_current) { + assert(s); + assert(s >= m->accept_limit_8); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport64(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? hend : buffer; + + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" : + q->items[q->cur].type == MQE_END ? 
"END" : "???", + q->items[q->cur].location + offset); + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + const u8 *final_look; + char rv = mcsheng64Exec8_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_HALT_MATCHING) { + *(u8 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u8 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = (u8)m->start_anchored; + break; + } + s = mcshengEnableStarts64(m, s); + break; + case MQE_END: + *(u8 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +char nfaExecMcSheng64_8_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng64_16_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng64_8_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng64 *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u8 *)q->state; + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + assert(s); + + if (s >= m->accept_limit_8) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport64(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +char nfaExecMcSheng64_16_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng64 *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u16 *)q->state; + const struct mstate_aux *aux = get_aux64(m, s); + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a 
offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + DEBUG_PRINTF("state %u\n", s); + assert(s); + + if (aux->accept) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport64(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +static +char mcshengHasAccept64(const struct mcsheng64 *m, const struct mstate_aux *aux, + ReportID report) { + assert(m && aux); + + if (!aux->accept) { + return 0; + } + + const struct report_list *rl = (const struct report_list *) + ((const char *)m + aux->accept - sizeof(struct NFA)); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + return 1; + } + } + + return 0; +} + +char nfaExecMcSheng64_8_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return mcshengHasAccept64(m, get_aux64(m, s), report); +} + +char nfaExecMcSheng64_8_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return !!get_aux64(m, s)->accept; +} + +char nfaExecMcSheng64_16_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return mcshengHasAccept64(m, get_aux64(m, s), report); +} + +char nfaExecMcSheng64_16_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return !!get_aux64(m, s)->accept; +} + +char nfaExecMcSheng64_8_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng64_16_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng64_8_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, + 0 /* end */, NO_MATCHES); + if (rv && nfaExecMcSheng64_8_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char 
nfaExecMcSheng64_16_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, + 0 /* end */, NO_MATCHES); + + if (rv && nfaExecMcSheng64_16_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecMcSheng64_8_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng64 *m = getImplNfa(nfa); + u8 s = offset ? m->start_floating : m->start_anchored; + if (s) { + *(u8 *)state = s; + return 1; + } + return 0; +} + +char nfaExecMcSheng64_16_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng64 *m = getImplNfa(nfa); + u16 s = offset ? m->start_floating : m->start_anchored; + if (s) { + unaligned_store_u16(state, s); + return 1; + } + return 0; +} + +char nfaExecMcSheng64_8_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + return mcshengCheckEOD64(nfa, *(const u8 *)state, offset, callback, + context); +} + +char nfaExecMcSheng64_16_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(ISALIGNED_N(state, 2)); + return mcshengCheckEOD64(nfa, *(const u16 *)state, offset, callback, + context); +} + +char nfaExecMcSheng64_8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + *(u8 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng64_16_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 2); + assert(ISALIGNED_N(q->state, 2)); + *(u16 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng64_8_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng64_8_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng64_16_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, + UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(src, 2)); + unaligned_store_u16(dest, *(const u16 *)(src)); + return 0; +} + +char nfaExecMcSheng64_16_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(dest, 2)); + *(u16 *)dest = unaligned_load_u16(src); + return 0; +} +#endif diff --git a/regex/nfa/mcsheng.h b/regex/nfa/mcsheng.h new file mode 100644 index 000000000..0329e1212 --- /dev/null +++ b/regex/nfa/mcsheng.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary 
forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MCSHENG_H +#define MCSHENG_H + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; + +/* 8-bit Sheng-McClellan hybrid */ + +char nfaExecMcSheng8_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng8_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng8_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng8_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng8_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng8_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng8_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng8_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng8_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcSheng8_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng8_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecMcSheng8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng8_zombie_status NFA_API_ZOMBIE_NO_IMPL + +/* 16-bit Sheng-McClellan hybrid */ + +char nfaExecMcSheng16_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng16_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng16_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng16_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng16_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng16_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng16_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng16_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng16_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char 
nfaExecMcSheng16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecMcSheng16_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng16_zombie_status NFA_API_ZOMBIE_NO_IMPL +#if defined(HAVE_AVX512VBMI) +/* 64-8 bit Sheng-McClellan hybrid */ +char nfaExecMcSheng64_8_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng64_8_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_8_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_8_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng64_8_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng64_8_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcSheng64_8_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng64_8_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecMcSheng64_8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_zombie_status NFA_API_ZOMBIE_NO_IMPL + +/* 64-16 bit Sheng-McClellan hybrid */ +char nfaExecMcSheng64_16_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng64_16_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_16_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_16_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng64_16_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng64_16_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcSheng64_16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng64_16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +#define nfaExecMcSheng64_16_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_zombie_status NFA_API_ZOMBIE_NO_IMPL +#else // !HAVE_AVX512VBMI +#define nfaExecMcSheng64_8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecMcSheng64_8_Q NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_Q2 NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_QR NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_inAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_inAnyAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_queueInitState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_queueCompressState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_expandState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_initCompressedState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_testEOD NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_reportCurrent NFA_API_NO_IMPL + +#define nfaExecMcSheng64_16_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_zombie_status 
NFA_API_ZOMBIE_NO_IMPL +#define nfaExecMcSheng64_16_Q NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_Q2 NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_QR NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_inAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_inAnyAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_queueInitState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_queueCompressState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_expandState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_initCompressedState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_testEOD NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_reportCurrent NFA_API_NO_IMPL + +#endif //end of HAVE_AVX512VBMI + +#endif diff --git a/regex/nfa/mcsheng_data.c b/regex/nfa/mcsheng_data.c new file mode 100644 index 000000000..0701b4b31 --- /dev/null +++ b/regex/nfa/mcsheng_data.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "mcsheng_internal.h" + +/* This table is in a separate translation unit from mcsheng.c as we want to + * prevent the compiler from seeing these constants. We have the load resources + * free at runtime to load the masks with no problems. 
*/ +const u64a mcsheng_pext_mask[8] = { + 0, /* dummy */ + 0x000000000000ff0f, + 0x0000000000ff000f, + 0x00000000ff00000f, + 0x000000ff0000000f, + 0x0000ff000000000f, + 0x00ff00000000000f, + 0xff0000000000000f, +}; +#if defined(HAVE_AVX512VBMI) +const u64a mcsheng64_pext_mask[8] = { + 0, /* dummy */ + 0x000000000000ff3f, + 0x0000000000ff003f, + 0x00000000ff00003f, + 0x000000ff0000003f, + 0x0000ff000000003f, + 0x00ff00000000003f, + 0xff0000000000003f, +}; +#endif diff --git a/regex/nfa/mcsheng_internal.h b/regex/nfa/mcsheng_internal.h new file mode 100644 index 000000000..d98557462 --- /dev/null +++ b/regex/nfa/mcsheng_internal.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef MCSHENG_INTERNAL_H +#define MCSHENG_INTERNAL_H + +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/simd_types.h" + +#define ACCEPT_FLAG 0x8000 +#define ACCEL_FLAG 0x4000 +#define STATE_MASK 0x3fff + +#define SHERMAN_STATE 1 + +#define SHERMAN_TYPE_OFFSET 0 +#define SHERMAN_FIXED_SIZE 32 + +#define SHERMAN_LEN_OFFSET 1 +#define SHERMAN_DADDY_OFFSET 2 +#define SHERMAN_CHARS_OFFSET 4 +#define SHERMAN_STATES_OFFSET(sso_len) (4 + (sso_len)) + +struct report_list { + u32 count; + ReportID report[]; +}; + +struct mstate_aux { + u32 accept; + u32 accept_eod; + u16 top; + u32 accel_offset; /* relative to start of struct mcsheng; 0 if no accel */ +}; + +#define MCSHENG_FLAG_SINGLE 1 /**< we raise only single accept id */ + +struct mcsheng { + u16 state_count; /**< total number of states */ + u32 length; /**< length of dfa in bytes */ + u16 start_anchored; /**< anchored start state */ + u16 start_floating; /**< floating start state */ + u32 aux_offset; /**< offset of the aux structures relative to the start of + * the nfa structure */ + u32 sherman_offset; /**< offset of array of sherman state offsets the + * state_info structures relative to the start of the + * nfa structure */ + u32 sherman_end; /**< offset of the end of the state_info structures + * relative to the start of the nfa structure */ + u16 sheng_end; /**< first non-sheng state */ + u16 sheng_accel_limit; /**< first sheng accel state. state given in terms of + * internal sheng ids */ + u16 accel_limit_8; /**< 8 bit, lowest accelerable state */ + u16 accept_limit_8; /**< 8 bit, lowest accept state */ + u16 sherman_limit; /**< lowest sherman state */ + u8 alphaShift; + u8 flags; + u8 has_accel; /**< 1 iff there are any accel plans */ + u8 remap[256]; /**< remaps characters to a smaller alphabet */ + ReportID arb_report; /**< one of the accepts that this dfa may raise */ + u32 accel_offset; /**< offset of accel structures from start of McClellan */ + m128 sheng_masks[N_CHARS]; +}; + +/* pext masks for the runtime to access appropriately copies of bytes 1..7 + * representing the data from a u64a. */ +extern const u64a mcsheng_pext_mask[8]; + +struct mcsheng64 { + u16 state_count; /**< total number of states */ + u32 length; /**< length of dfa in bytes */ + u16 start_anchored; /**< anchored start state */ + u16 start_floating; /**< floating start state */ + u32 aux_offset; /**< offset of the aux structures relative to the start of + * the nfa structure */ + u32 sherman_offset; /**< offset of array of sherman state offsets the + * state_info structures relative to the start of the + * nfa structure */ + u32 sherman_end; /**< offset of the end of the state_info structures + * relative to the start of the nfa structure */ + u16 sheng_end; /**< first non-sheng state */ + u16 sheng_accel_limit; /**< first sheng accel state. 
state given in terms of + * internal sheng ids */ + u16 accel_limit_8; /**< 8 bit, lowest accelerable state */ + u16 accept_limit_8; /**< 8 bit, lowest accept state */ + u16 sherman_limit; /**< lowest sherman state */ + u8 alphaShift; + u8 flags; + u8 has_accel; /**< 1 iff there are any accel plans */ + u8 remap[256]; /**< remaps characters to a smaller alphabet */ + ReportID arb_report; /**< one of the accepts that this dfa may raise */ + u32 accel_offset; /**< offset of accel structures from start of McClellan */ + m512 sheng_succ_masks[N_CHARS]; +}; + +extern const u64a mcsheng64_pext_mask[8]; + +#endif diff --git a/regex/nfa/mpv.c b/regex/nfa/mpv.c new file mode 100644 index 000000000..d03009b20 --- /dev/null +++ b/regex/nfa/mpv.c @@ -0,0 +1,1100 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "mpv.h" + +#include "mpv_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "shufti.h" +#include "truffle.h" +#include "ue2common.h" +#include "vermicelli.h" +#include "vermicelli_run.h" +#include "util/multibit.h" +#include "util/partial_store.h" +#include "util/simd_utils.h" +#include "util/unaligned.h" + +#ifndef __KERNEL__ +#include +#else +#include +#endif + +#define MIN_SKIP_REPEAT 32 + +typedef struct mpv_pq_item PQ_T; +#define PQ_COMP(pqc_items, a, b) \ + ((pqc_items)[a].trigger_loc < (pqc_items)[b].trigger_loc) +#define PQ_COMP_B(pqc_items, a, b_fixed) \ + ((pqc_items)[a].trigger_loc < (b_fixed).trigger_loc) + +#include "util/pqueue.h" + +static really_inline +u64a *get_counter_n(struct mpv_decomp_state *s, + const struct mpv *m, u32 n) { + return (u64a *)((char *)s + get_counter_info(m)[n].counter_offset); +} + +static really_inline +u64a *get_counter_for_kilo(struct mpv_decomp_state *s, + const struct mpv_kilopuff *kp) { + return (u64a *)((char *)s + kp->counter_offset); +} + +static really_inline +u64a get_counter_value_for_kilo(struct mpv_decomp_state *s, + const struct mpv_kilopuff *kp) { + return *get_counter_for_kilo(s, kp) + s->counter_adj; +} + +static really_inline +const u64a *get_counter_for_kilo_c(const struct mpv_decomp_state *s, + const struct mpv_kilopuff *kp) { + return (const u64a *)((const char *)s + kp->counter_offset); +} + + +static never_inline +void normalize_counters(struct mpv_decomp_state *dstate, const struct mpv *m) { + u64a adj = dstate->counter_adj; + u64a *counters = get_counter_n(dstate, m, 0); + + if (!adj) { + return; + } + + for (u32 i = 0; i < m->counter_count; i++) { + /* update all counters - alive or dead */ + counters[i] += adj; + DEBUG_PRINTF("counter %u: %llu\n", i, counters[i]); + } + + dstate->counter_adj = 0; +} + +static really_inline +char processReports(const struct mpv *m, u8 *reporters, + const struct mpv_decomp_state *dstate, u64a counter_adj, + u64a report_offset, NfaCallback cb, void *ctxt, + ReportID *rl, u32 *rl_count_out) { + DEBUG_PRINTF("reporting at offset %llu\n", report_offset); + const struct mpv_kilopuff *kp = (const void *)(m + 1); + u32 rl_count = 0; + + for (u32 i = mmbit_iterate(reporters, m->kilo_count, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(reporters, m->kilo_count, i)) { + const struct mpv_puffette *curr = dstate->active[i].curr; + u64a curr_counter_val = *get_counter_for_kilo_c(dstate, &kp[i]) + + counter_adj; + DEBUG_PRINTF("kilo %u, underlying counter: %llu current: %llu\n", i, + *get_counter_for_kilo_c(dstate, &kp[i]), curr_counter_val); + assert(curr_counter_val != MPV_DEAD_VALUE); /* counter_adj should take + * care if underlying value + * is -1 */ + char did_stuff = 0; + + while (curr->report != INVALID_REPORT) { + assert(curr_counter_val >= curr->repeats); + if (curr->unbounded || curr_counter_val == curr->repeats) { + DEBUG_PRINTF("report %u at %llu\n", curr->report, + report_offset); + + if (curr->unbounded && !curr->simple_exhaust) { + assert(rl_count < m->puffette_count); + *rl = curr->report; + ++rl; + rl_count++; + } + + if (cb(0, report_offset, curr->report, ctxt) == + MO_HALT_MATCHING) { + DEBUG_PRINTF("bailing\n"); + return MO_HALT_MATCHING; + } + did_stuff = 1; + } + + curr--; + } + + if (!did_stuff) { + mmbit_unset(reporters, m->kilo_count, i); + } + } + + *rl_count_out = rl_count; + return MO_CONTINUE_MATCHING; +} + +static +ReportID *get_report_list(const struct mpv *m, struct mpv_decomp_state *s) { + return 
(ReportID *)((char *)s + m->report_list_offset); +} + +static really_inline +char processReportsForRange(const struct mpv *m, u8 *reporters, + struct mpv_decomp_state *dstate, u64a first_offset, + size_t length, NfaCallback cb, void *ctxt) { + if (!length) { + return MO_CONTINUE_MATCHING; + } + + u64a counter_adj = dstate->counter_adj; + u32 rl_count = 0; + ReportID *rl = get_report_list(m, dstate); + char rv = processReports(m, reporters, dstate, 1 + counter_adj, + first_offset + 1, cb, ctxt, rl, &rl_count); + if (rv != MO_CONTINUE_MATCHING) { + DEBUG_PRINTF("bailing\n"); + return rv; + } + if (!rl_count) { + return MO_CONTINUE_MATCHING; + } + + DEBUG_PRINTF("length=%zu, rl_count=%u\n", length, rl_count); + + for (size_t i = 2; i <= length; i++) { + for (u32 j = 0; j < rl_count; j++) { + if (cb(0, first_offset + i, rl[j], ctxt) == MO_HALT_MATCHING) { + DEBUG_PRINTF("bailing\n"); + return MO_HALT_MATCHING; + } + } + } + + return MO_CONTINUE_MATCHING; +} + +/* returns last puffette that we have satisfied */ +static +const struct mpv_puffette *get_curr_puff(const struct mpv *m, + const struct mpv_kilopuff *kp, + struct mpv_decomp_state *dstate) { + u64a counter = *get_counter_for_kilo(dstate, kp); + assert(counter != MPV_DEAD_VALUE); + + const struct mpv_puffette *p = get_puff_array(m, kp); + DEBUG_PRINTF("looking for current puffette (counter = %llu)\n", counter); + DEBUG_PRINTF("next: (%u, %u)\n", p->repeats, p->report); + while (counter + 1 >= p->repeats && p->report != INVALID_REPORT) { + DEBUG_PRINTF("advancing\n"); + ++p; + DEBUG_PRINTF("next: (%u, %u)\n", p->repeats, p->report); + } + + return p - 1; +} + +static +const struct mpv_puffette *get_init_puff(const struct mpv *m, + const struct mpv_kilopuff *kp) { + const struct mpv_puffette *p = get_puff_array(m, kp); + while (p->repeats == 1) { + ++p; + } + return p - 1; +} + + +/* returns the last puffette whose repeats have been satisfied */ +static really_inline +const struct mpv_puffette *update_curr_puff(const struct mpv *m, u8 *reporters, + u64a counter, + const struct mpv_puffette *in, + u32 kilo_index) { + assert(counter != MPV_DEAD_VALUE); + + const struct mpv_puffette *p = in; + DEBUG_PRINTF("looking for current puffette (counter = %llu)\n", counter); + DEBUG_PRINTF("curr: (%u, %u)\n", p->repeats, p->report); + while (counter + 1 >= p[1].repeats && p[1].report != INVALID_REPORT) { + DEBUG_PRINTF("advancing\n"); + ++p; + DEBUG_PRINTF("curr: (%u, %u)\n", p->repeats, p->report); + } + + if (p != in) { + mmbit_set(reporters, m->kilo_count, kilo_index); + } + + return p; +} + +static really_inline +size_t limitByReach(const struct mpv_kilopuff *kp, const u8 *buf, + size_t length) { + if (kp->type == MPV_VERM) { + return vermicelliExec(kp->u.verm.c, 0, buf, buf + length) - buf; + } else if (kp->type == MPV_SHUFTI) { + m128 mask_lo = kp->u.shuf.mask_lo; + m128 mask_hi = kp->u.shuf.mask_hi; + return shuftiExec(mask_lo, mask_hi, buf, buf + length) - buf; + } else if (kp->type == MPV_TRUFFLE) { + return truffleExec(kp->u.truffle.mask1, kp->u.truffle.mask2, buf, buf + length) - buf; + } else if (kp->type == MPV_NVERM) { + return nvermicelliExec(kp->u.verm.c, 0, buf, buf + length) - buf; + } + + assert(kp->type == MPV_DOT); + return length; +} + +static never_inline +void fillLimits(const struct mpv *m, u8 *active, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, + const u8 *buf, size_t length) { + DEBUG_PRINTF("filling limits %zu\n", length); + assert(!dstate->pq_size); + + if (!length) { + DEBUG_PRINTF("0 
length\n"); + return; + } + + const struct mpv_kilopuff *kp = (const void *)(m + 1); + + for (u32 i = mmbit_iterate(active, m->kilo_count, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, m->kilo_count, i)) { + dstate->active[i].curr = get_curr_puff(m, &kp[i], dstate); + if (dstate->active[i].curr->report != INVALID_REPORT) { + /* this kilo puff may fire reports */ + mmbit_set(reporters, m->kilo_count, i); + } + + u64a lim = limitByReach(&kp[i], buf, length); + DEBUG_PRINTF("lim %llu/%zu\n", lim, length); + + if (kp[i].dead_point != MPV_DEAD_VALUE) { + assert(!kp[i].auto_restart); + u64a counter = get_counter_value_for_kilo(dstate, &kp[i]); + u64a dp_trigger = kp[i].dead_point - counter; + if (dp_trigger < lim) { + DEBUG_PRINTF("dead point trigger %llu\n", dp_trigger); + lim = dp_trigger; + } + } + + if (kp[i].auto_restart && !lim) { + *get_counter_for_kilo(dstate, &kp[i]) = MPV_DEAD_VALUE; + mmbit_unset(reporters, m->kilo_count, i); + /* the counter value will cause the nex_trigger calculation below to + * adjust correctly */ + if (length == 1) { + dstate->active[i].limit = 0; + continue; + } + + lim = limitByReach(&kp[i], buf + 1, length - 1) + 1; + + + /* restart active counters */ + dstate->active[i].curr = get_init_puff(m, &kp[i]); + assert(dstate->active[i].curr[0].report == INVALID_REPORT); + + DEBUG_PRINTF("lim now %llu/%zu\n", lim, length); + } + + dstate->active[i].limit = lim; + if (!lim) { + mmbit_unset(active, m->kilo_count, i); + mmbit_unset(reporters, m->kilo_count, i); + continue; + } + if (dstate->active[i].curr[1].report != INVALID_REPORT) { + u32 next_trigger = dstate->active[i].curr[1].repeats - 1ULL + - *get_counter_for_kilo(dstate, &kp[i]); + DEBUG_PRINTF("next trigger %u\n", next_trigger); + lim = MIN(lim, next_trigger); + } + + if (lim != length) { + struct mpv_pq_item temp = { + .trigger_loc = lim, + .kilo = i + }; + + DEBUG_PRINTF("push for %u at %llu\n", i, lim); + pq_insert(pq, dstate->pq_size, temp); + ++dstate->pq_size; + } + + assert(lim || kp[i].auto_restart); + } + + DEBUG_PRINTF("filled\n"); + dstate->filled = 1; +} + +static never_inline +void handleTopN(const struct mpv *m, s64a loc, u8 *active, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, + const u8 *buf, size_t length, u32 i) { + assert(i < m->kilo_count); + DEBUG_PRINTF("MQE_TOP + %u @%lld\n", i, loc); + if (mmbit_set(active, m->kilo_count, i)) { + DEBUG_PRINTF("kilo is already alive and kicking\n"); + return; + } + + const struct mpv_kilopuff *kp = (const struct mpv_kilopuff *)(m + 1); + + assert(!kp[i].auto_restart); /* handle later/never */ + + /* we need to ensure that the counters are upto date */ + normalize_counters(dstate, m); + + /* reset counter */ + *get_counter_for_kilo(dstate, &kp[i]) = 0; + + if ((size_t)loc == length) { + /* end of buffer, just make sure it is active */ + dstate->active[i].limit = loc; + dstate->active[i].curr = get_init_puff(m, &kp[i]); + return; + } + + /* find the limit */ + u64a lim = limitByReach(&kp[i], buf + loc, length - loc) + loc; + + /* no need to worry about dead_point triggers here as kilopuff must first + * update chain (to fire a report) before it goes dead. 
*/ + + if (lim == (u64a)loc) { + DEBUG_PRINTF("dead on arrival\n"); + mmbit_unset(active, m->kilo_count, i); + return; + } + dstate->active[i].limit = lim; + + /* setup puffette, find next trigger */ + dstate->active[i].curr = get_init_puff(m, &kp[i]); + if (dstate->active[i].curr[1].report != INVALID_REPORT) { + u32 next_trigger = dstate->active[i].curr[1].repeats - 1ULL + loc; + lim = MIN(lim, next_trigger); + } + + assert(dstate->active[i].curr[0].repeats == 1 + || dstate->active[i].curr[0].report == INVALID_REPORT); + if (dstate->active[i].curr[0].repeats == 1) { + DEBUG_PRINTF("yippee\n"); + mmbit_set(reporters, m->kilo_count, i); + } + + assert(lim > (u64a)loc); + + /* add to pq */ + if (lim != length) { + struct mpv_pq_item temp = { + .trigger_loc = lim, + .kilo = i + }; + + DEBUG_PRINTF("push for %u at %llu\n", i, lim); + pq_insert(pq, dstate->pq_size, temp); + ++dstate->pq_size; + } +} + +static really_inline +void killKilo(const struct mpv *m, u8 *active, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, u32 i) { + DEBUG_PRINTF("squashing kilo %u (progress %llu, limit %llu)\n", + i, pq_top(pq)->trigger_loc, dstate->active[i].limit); + mmbit_unset(active, m->kilo_count, i); + mmbit_unset(reporters, m->kilo_count, i); + + pq_pop(pq, dstate->pq_size); + dstate->pq_size--; +} + +static really_inline +void updateKiloChains(const struct mpv *m, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, + u64a curr_loc, size_t buf_length, u32 i) { + const struct mpv_kilopuff *kp = (const void *)(m + 1); + u64a counter = get_counter_value_for_kilo(dstate, &kp[i]); + + DEBUG_PRINTF("updating active puff for kilo %u\n", i); + dstate->active[i].curr = update_curr_puff(m, reporters, counter, + dstate->active[i].curr, i); + + u64a next_trigger = dstate->active[i].limit; + + if (dstate->active[i].curr[1].report != INVALID_REPORT) { + u64a next_rep_trigger = dstate->active[i].curr[1].repeats - 1 - counter + + curr_loc; + + next_trigger = MIN(next_trigger, next_rep_trigger); + } else if (kp[i].dead_point != MPV_DEAD_VALUE) { + u64a dp_trigger = kp[i].dead_point - counter + curr_loc; + DEBUG_PRINTF("dead point trigger %llu\n", dp_trigger); + if (dp_trigger < dstate->active[i].limit) { + dstate->active[i].limit = dp_trigger; + next_trigger = dp_trigger; + } + } + + DEBUG_PRINTF("next trigger location is %llu\n", next_trigger); + + if (next_trigger < buf_length) { + assert(dstate->pq_size <= m->kilo_count); + assert(next_trigger > pq_top(pq)->trigger_loc); + struct mpv_pq_item temp = { + .trigger_loc = next_trigger, + .kilo = i + }; + + DEBUG_PRINTF("(replace) push for %u at %llu\n", i, next_trigger); + pq_replace_top(pq, dstate->pq_size, temp); + } else { + pq_pop(pq, dstate->pq_size); + dstate->pq_size--; + DEBUG_PRINTF("PQ_POP\n"); + } + DEBUG_PRINTF("pq size now %u next top %llu\n", dstate->pq_size, + pq_top(pq)->trigger_loc); +} + +static really_inline +u8 do_single_shufti(const m128 l, const m128 h, u8 c) { + const u8 *lo = (const u8 *)&l; + const u8 *hi = (const u8 *)&h; + return lo[c & 0xf] & hi[c >> 4]; +} + +static really_inline +size_t find_last_bad(const struct mpv_kilopuff *kp, const u8 *buf, + size_t length, size_t curr, u32 min_rep) { + assert(kp->type != MPV_DOT); + + DEBUG_PRINTF("repeats = %u\n", min_rep); + /* TODO: this should be replace by some sort of simd stuff */ + + if (kp->type == MPV_VERM) { + if (min_rep < MIN_SKIP_REPEAT) { + return find_nverm_run(kp->u.verm.c, 0, min_rep, buf, buf + curr, + buf + length) - buf - 1; + } + + 
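+        /*
+         * Large repeats: jump min_rep bytes ahead of the current anchor and
+         * walk backwards for the right-most byte in that window that still
+         * matches; each hit becomes the new anchor and the jump is retried
+         * from there.
+         */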
verm_restart:; + assert(buf[curr] == kp->u.verm.c); + size_t test = curr; + if (curr + min_rep < length) { + test = curr + min_rep; + } else { + test = length - 1; + } + + while (test > curr) { + if (buf[test] == kp->u.verm.c) { + curr = test; + if (curr == length - 1) { + return curr; + } + goto verm_restart; + } + --test; + } + } else if (kp->type == MPV_SHUFTI) { + m128 lo = kp->u.shuf.mask_lo; + m128 hi = kp->u.shuf.mask_hi; + shuf_restart: + assert(do_single_shufti(lo, hi, buf[curr])); + size_t test = curr; + if (curr + min_rep < length) { + test = curr + min_rep; + } else { + test = length - 1; + } + + while (test > curr) { + if (do_single_shufti(lo, hi, buf[test])) { + DEBUG_PRINTF("updating curr from %zu to %zu\n", curr, test); + curr = test; + if (curr == length - 1) { + return curr; + } + goto shuf_restart; + } + --test; + } + } else if (kp->type == MPV_TRUFFLE) { + const m128 mask1 = kp->u.truffle.mask1; + const m128 mask2 = kp->u.truffle.mask2; + truffle_restart:; + size_t test = curr; + if (curr + min_rep < length) { + test = curr + min_rep; + } else { + test = length - 1; + } + + while (test > curr) { + const u8 *rv = truffleExec(mask1, mask2, buf + test, buf + test + 1); + if (rv == buf + test) { + curr = test; + if (curr == length - 1) { + return curr; + } + goto truffle_restart; + } + --test; + } + } else if (kp->type == MPV_NVERM) { + if (min_rep < MIN_SKIP_REPEAT) { + return find_verm_run(kp->u.verm.c, 0, min_rep, buf, buf + curr, + buf + length) - buf - 1; + } + + nverm_restart:; + assert(buf[curr] != kp->u.verm.c); + size_t test = curr; + if (curr + min_rep < length) { + test = curr + min_rep; + } else { + test = length - 1; + } + + while (test > curr) { + if (buf[test] != kp->u.verm.c) { + curr = test; + if (curr == length - 1) { + return curr; + } + goto nverm_restart; + } + --test; + } + } else { + assert(0); + } + + return curr; +} + +static really_inline +void restartKilo(const struct mpv *m, UNUSED u8 *active, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, + const u8 *buf, u64a prev_limit, size_t buf_length, u32 i) { + const struct mpv_kilopuff *kp = (const void *)(m + 1); + assert(kp[i].auto_restart); + assert(mmbit_isset(active, m->kilo_count, i)); + + DEBUG_PRINTF("we got to %llu,%llu\n", prev_limit, dstate->active[i].limit); + assert(prev_limit == dstate->active[i].limit); + + DEBUG_PRINTF("resetting counter\n"); + + /* we need to ensure that the counters are upto date */ + normalize_counters(dstate, m); + + /* current byte is dead, will wrap to 0 after processing this byte */ + assert(MPV_DEAD_VALUE + 1 == 0); + *get_counter_for_kilo(dstate, &kp[i]) = MPV_DEAD_VALUE; + + DEBUG_PRINTF("resetting puffettes\n"); + dstate->active[i].curr = get_init_puff(m, &kp[i]); + + assert(dstate->active[i].curr[0].report == INVALID_REPORT); + /* TODO: handle restart .{1,}s */ + + mmbit_unset(reporters, m->kilo_count, i); + + if (prev_limit != buf_length - 1) { + size_t last_bad = find_last_bad(&kp[i], buf, buf_length, prev_limit, + dstate->active[i].curr[1].repeats); + assert(last_bad >= prev_limit && last_bad < buf_length); + if (last_bad != prev_limit) { + /* there is no point in getting restarted at this location */ + dstate->active[i].limit = last_bad; + assert(dstate->pq_size <= m->kilo_count); + struct mpv_pq_item temp = { + .trigger_loc = last_bad, + .kilo = i + }; + + pq_replace_top(pq, dstate->pq_size, temp); + return; + } + } + + /* TODO: skipping would really come in handy about now */ + u64a lim; + if (buf_length > prev_limit + 1) { + 
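+        /* find the new reach limit for the restarted kilopuff, scanning
+         * from the byte after the dead one */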
lim = limitByReach(&kp[i], buf + prev_limit + 1, + buf_length - (prev_limit + 1)) + + prev_limit + 1; + } else { + assert(buf_length == prev_limit + 1); + lim = buf_length; + } + DEBUG_PRINTF("next limit is %llu\n", lim); + + assert(lim > prev_limit); + + dstate->active[i].limit = lim; + + if (dstate->active[i].curr[1].report != INVALID_REPORT) { + u32 next_trigger = dstate->active[i].curr[1].repeats + prev_limit; + lim = MIN(lim, next_trigger); + } + + DEBUG_PRINTF("next trigger for kilo at %llu\n", lim); + + if (lim < buf_length) { + assert(dstate->pq_size <= m->kilo_count); + assert(lim >= prev_limit); + struct mpv_pq_item temp = { + .trigger_loc = lim, + .kilo = i + }; + + pq_replace_top(pq, dstate->pq_size, temp); + } else { + pq_pop(pq, dstate->pq_size); + dstate->pq_size--; + } +} + +static really_inline +void handle_events(const struct mpv *m, u8 *active, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, + u64a loc, const u8 *buf, size_t buf_length) { + const struct mpv_kilopuff *kp = (const void *)(m + 1); + + while (dstate->pq_size && pq_top(pq)->trigger_loc <= loc) { + assert(pq_top(pq)->trigger_loc == loc); + + u32 kilo = pq_top(pq)->kilo; + + DEBUG_PRINTF("pop for kilo %u at %llu\n", kilo, + pq_top(pq)->trigger_loc); + + if (dstate->active[kilo].limit <= loc) { + if (!kp[kilo].auto_restart) { + killKilo(m, active, reporters, dstate, pq, kilo); + } else { + restartKilo(m, active, reporters, dstate, pq, buf, loc, + buf_length, kilo); + } + } else { + updateKiloChains(m, reporters, dstate, pq, loc, buf_length, kilo); + } + } +} + +static really_inline +u64a find_next_limit(const struct mpv *m, u8 *active, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, + const u8 *buf, u64a prev_limit, u64a ep, + size_t buf_length) { + u64a limit = ep; + + DEBUG_PRINTF("length %llu (prev %llu), pq %u\n", limit, prev_limit, + dstate->pq_size); + + handle_events(m, active, reporters, dstate, pq, prev_limit, buf, + buf_length); + + if (dstate->pq_size) { + limit = MIN(pq_top(pq)->trigger_loc, limit); + assert(limit > prev_limit); + } + + DEBUG_PRINTF("limit now %llu\n", limit); + return limit; +} + +static really_inline +char mpvExec(const struct mpv *m, u8 *active, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, + const u8 *buf, s64a start, size_t length, size_t buf_length, + u64a offsetAdj, NfaCallback cb, void *ctxt) { + DEBUG_PRINTF("running mpv (s %lliu, l %zu, o %llu)\n", + *get_counter_n(dstate, m, 0) + dstate->counter_adj, length, + offsetAdj); + + u64a progress = start; /* progress is relative to buffer offsets */ + + while (progress < length) { + DEBUG_PRINTF("progress %llu\n", progress); + + /* find next limit and update chains */ + u64a limit = find_next_limit(m, active, reporters, dstate, pq, buf, + progress, length, buf_length); + assert(limit != progress); + u64a incr = limit - progress; + DEBUG_PRINTF("incr = %llu\n", incr); + + /* report matches upto next limit */ + char rv = processReportsForRange(m, reporters, dstate, + offsetAdj + progress, limit - progress, + cb, ctxt); + + if (rv != MO_CONTINUE_MATCHING) { + DEBUG_PRINTF("mpvExec done %llu/%zu\n", progress, length); + return rv; + } + + dstate->counter_adj += incr; + progress = limit; + } + + assert(progress == length); + + DEBUG_PRINTF("mpvExec done\n"); + return MO_CONTINUE_MATCHING; +} + +static really_inline +void mpvLoadState(struct mpv_decomp_state *out, const struct NFA *n, + const char *state) { + assert(16 >= sizeof(struct mpv_decomp_kilo)); + 
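+    /* widen each partially-stored counter from the compressed stream state
+     * back into a full u64a working copy (the reverse of mpvStoreState()) */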
assert(sizeof(*out) <= n->scratchStateSize); + assert(ISALIGNED(out)); + + const struct mpv *m = getImplNfa(n); + const struct mpv_counter_info *counter_info = get_counter_info(m); + u64a *counters = get_counter_n(out, m, 0); + const char *comp_counter = state; + for (u32 i = 0; i < m->counter_count; i++) { + u32 counter_size = counter_info[i].counter_size; + counters[i] = partial_load_u64a(comp_counter, counter_size); + DEBUG_PRINTF("loaded %llu counter %u\n", counters[i], i); + comp_counter += counter_size; + } + + out->filled = 0; /* _Q_i will fill limits, curr puffetes, and populate pq + * on first call */ + out->counter_adj = 0; + out->pq_size = 0; + + u8 *reporters = (u8 *)out + m->reporter_offset; + + mmbit_clear(reporters, m->kilo_count); +} + +static really_inline +void mpvStoreState(const struct NFA *n, char *state, + const struct mpv_decomp_state *in) { + assert(ISALIGNED(in)); + const struct mpv *m = getImplNfa(n); + const struct mpv_counter_info *counter_info = get_counter_info(m); + + const u64a *counters = (const u64a *)((const char *)in + + get_counter_info(m)[0].counter_offset); + u64a adj = in->counter_adj; + char *comp_counter = state; + for (u32 i = 0; i < m->counter_count; i++) { + /* clamp counter to allow storage in smaller ints */ + u64a curr_counter = MIN(counters[i] + adj, counter_info[i].max_counter); + + u32 counter_size = counter_info[i].counter_size; + partial_store_u64a(comp_counter, curr_counter, counter_size); + DEBUG_PRINTF("stored %llu counter %u (orig %llu)\n", curr_counter, i, + counters[i]); + /* assert(counters[i] != MPV_DEAD_VALUE); /\* should have process 1 byte */ + /* * since a clear *\/ */ + comp_counter += counter_size; + } +} + +char nfaExecMpv_queueCompressState(const struct NFA *nfa, const struct mq *q, + UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + mpvStoreState(nfa, dest, src); + return 0; +} + +char nfaExecMpv_expandState(const struct NFA *nfa, void *dest, const void *src, + UNUSED u64a offset, UNUSED u8 key) { + mpvLoadState(dest, nfa, src); + return 0; +} + +char nfaExecMpv_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mpv *m = getImplNfa(n); + u64a offset = q_cur_offset(q); + struct mpv_decomp_state *s = (struct mpv_decomp_state *)q->state; + + DEBUG_PRINTF("report current: offset %llu\n", offset); + + u8 *active = (u8 *)q->streamState + m->active_offset; + u32 rl_count = 0; + ReportID *rl = get_report_list(m, s); + + processReports(m, active, s, s->counter_adj, offset, q->cb, q->context, rl, + &rl_count); + return 0; +} + +char nfaExecMpv_queueInitState(const struct NFA *n, struct mq *q) { + struct mpv_decomp_state *out = (void *)q->state; + const struct mpv *m = getImplNfa(n); + assert(sizeof(*out) <= n->scratchStateSize); + + DEBUG_PRINTF("queue init state\n"); + + u64a *counters = get_counter_n(out, m, 0); + for (u32 i = 0; i < m->counter_count; i++) { + counters[i] = MPV_DEAD_VALUE; + } + + out->filled = 0; + out->counter_adj = 0; + out->pq_size = 0; + out->active[0].curr = NULL; + + assert(q->streamState); + u8 *active_kpuff = (u8 *)q->streamState + m->active_offset; + u8 *reporters = (u8 *)q->state + m->reporter_offset; + mmbit_clear(active_kpuff, m->kilo_count); + mmbit_clear(reporters, m->kilo_count); + return 0; +} + +char nfaExecMpv_initCompressedState(const struct NFA *n, u64a offset, + void *state, UNUSED u8 key) { + const struct mpv *m = getImplNfa(n); + memset(state, 0, m->active_offset); /* active_offset marks end of comp + * counters */ + u8 *active_kpuff = (u8 
*)state + m->active_offset; + if (!offset) { + mmbit_init_range(active_kpuff, m->kilo_count, m->top_kilo_begin, + m->top_kilo_end); + return 1; + } else { + return 0; + } +} + +static really_inline +char nfaExecMpv_Q_i(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + size_t length = q->length; + NfaCallback cb = q->cb; + void *context = q->context; + s64a sp; + const struct mpv *m = getImplNfa(n); + struct mpv_decomp_state *s = (struct mpv_decomp_state *)q->state; + u8 *active = (u8 *)q->streamState + m->active_offset; + u8 *reporters = (u8 *)q->state + m->reporter_offset; + struct mpv_pq_item *pq = (struct mpv_pq_item *)(q->state + m->pq_offset); + + if (!s->filled) { + fillLimits(m, active, reporters, s, pq, q->buffer, q->length); + } + + assert(!q->report_current); + + if (q->cur == q->end) { + return 1; + } + + assert(q->cur + 1 < q->end); /* require at least two items */ + + assert(q_cur_type(q) == MQE_START); + assert(q_cur_loc(q) >= 0); + sp = q->items[q->cur].location; + q->cur++; + + if (q->items[q->cur - 1].location > end) { + /* this is as far as we go */ + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + return MO_ALIVE; + } + + while (q->cur < q->end) { + s64a ep = q->items[q->cur].location; + + ep = MIN(ep, end); + + assert(ep >= sp); + + assert(sp >= 0); /* mpv should be an outfix; outfixes are not lazy */ + + if (sp >= ep) { + goto scan_done; + } + + /* do main buffer region */ + assert((u64a)ep <= length); + char rv = mpvExec(m, active, reporters, s, pq, buffer, sp, ep, length, + offset, cb, context); + if (rv == MO_HALT_MATCHING) { + q->cur = q->end; + return 0; + } + + scan_done: + if (q->items[q->cur].location > end) { + /* this is as far as we go */ + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + return MO_ALIVE; + } + + sp = ep; + + switch (q->items[q->cur].type) { + case MQE_TOP: + DEBUG_PRINTF("top %u %u\n", m->top_kilo_begin, m->top_kilo_end); + /* MQE_TOP initialise all counters to 0; activates all kilos */ + { + u64a *counters = get_counter_n(s, m, 0); + assert(counters[0] == MPV_DEAD_VALUE); + assert(!s->counter_adj); + for (u32 i = 0; i < m->counter_count; i++) { + counters[i] = 0; + } + mmbit_init_range(active, m->kilo_count, m->top_kilo_begin, + m->top_kilo_end); + fillLimits(m, active, reporters, s, pq, buffer, length); + } + break; + case MQE_START: + case MQE_END: + break; + default: + /* MQE_TOP_N --> switch on kilo puff N */ + assert(q->items[q->cur].type >= MQE_TOP_FIRST); + assert(q->items[q->cur].type < MQE_INVALID); + u32 i = q->items[q->cur].type - MQE_TOP_FIRST; + handleTopN(m, sp, active, reporters, s, pq, buffer, length, i); + break; + } + + q->cur++; + } + + char alive = 0; + assert(q->items[q->cur - 1].type == MQE_END); + if (q->items[q->cur - 1].location == (s64a)q->length) { + normalize_counters(s, m); + + const struct mpv_kilopuff *kp = (const struct mpv_kilopuff *)(m + 1); + for (u32 i = mmbit_iterate(active, m->kilo_count, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, m->kilo_count, i)) { + if (*get_counter_for_kilo(s, &kp[i]) >= kp[i].dead_point) { + mmbit_unset(active, m->kilo_count, i); + } else { + alive = 1; + } + } + } else { + alive + = mmbit_iterate(active, m->kilo_count, MMB_INVALID) != MMB_INVALID; + } + + DEBUG_PRINTF("finished %d\n", (int)alive); + return alive; +} + +char nfaExecMpv_Q(const struct NFA *n, struct mq *q, s64a end) { + DEBUG_PRINTF("_Q %lld\n", end); + return 
nfaExecMpv_Q_i(n, q, end); +} + +s64a nfaExecMpv_QueueExecRaw(const struct NFA *nfa, struct mq *q, s64a end) { + DEBUG_PRINTF("nfa=%p end=%lld\n", nfa, end); +#ifdef DEBUG + debugQueue(q); +#endif + + assert(nfa->type == MPV_NFA); + assert(q && q->context && q->state); + assert(end >= 0); + assert(q->cur < q->end); + assert(q->end <= MAX_MQE_LEN); + assert(ISALIGNED_16(nfa) && ISALIGNED_16(getImplNfa(nfa))); + assert(end < q->items[q->end - 1].location + || q->items[q->end - 1].type == MQE_END); + + if (q->items[q->cur].location > end) { + return 1; + } + + char q_trimmed = 0; + + assert(end <= (s64a)q->length || !q->hlength); + /* due to reverse accel in block mode some queues may work on a truncated + * buffer */ + if (end > (s64a)q->length) { + end = q->length; + q_trimmed = 1; + } + + /* TODO: restore max offset stuff, if/when _interesting_ max offset stuff + * is filled in */ + + char rv = nfaExecMpv_Q_i(nfa, q, end); + + assert(!q->report_current); + DEBUG_PRINTF("returned rv=%d, q_trimmed=%d\n", rv, q_trimmed); + if (q_trimmed || !rv) { + return 0; + } else { + const struct mpv *m = getImplNfa(nfa); + u8 *reporters = (u8 *)q->state + m->reporter_offset; + + if (mmbit_any_precise(reporters, m->kilo_count)) { + DEBUG_PRINTF("next byte\n"); + return 1; /* need to match at next byte */ + } else { + s64a next_event = q->length; + s64a next_pq = q->length; + + if (q->cur < q->end) { + next_event = q->items[q->cur].location; + } + + struct mpv_decomp_state *s = (struct mpv_decomp_state *)q->state; + struct mpv_pq_item *pq + = (struct mpv_pq_item *)(q->state + m->pq_offset); + if (s->pq_size) { + next_pq = pq_top(pq)->trigger_loc; + } + + assert(next_event); + assert(next_pq); + + DEBUG_PRINTF("next pq %lld event %lld\n", next_pq, next_event); + return MIN(next_pq, next_event); + } + } +} diff --git a/regex/nfa/mpv.h b/regex/nfa/mpv.h new file mode 100644 index 000000000..3780728d7 --- /dev/null +++ b/regex/nfa/mpv.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef MPV_H +#define MPV_H + +#include "ue2common.h" + +struct mq; +struct NFA; + +char nfaExecMpv_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMpv_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMpv_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMpv_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMpv_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecMpv_expandState(const struct NFA *nfa, void *dest, const void *src, + u64a offset, u8 key); + +#define nfaExecMpv_testEOD NFA_API_NO_IMPL +#define nfaExecMpv_inAccept NFA_API_NO_IMPL +#define nfaExecMpv_inAnyAccept NFA_API_NO_IMPL +#define nfaExecMpv_QR NFA_API_NO_IMPL +#define nfaExecMpv_Q2 NFA_API_NO_IMPL /* for non-chained suffixes. */ +#define nfaExecMpv_B_Reverse NFA_API_NO_IMPL +#define nfaExecMpv_zombie_status NFA_API_ZOMBIE_NO_IMPL + +/** + * return 0 if the mpv dies, otherwise returns the location of the next possible + * match (given the currently known events). */ +s64a nfaExecMpv_QueueExecRaw(const struct NFA *nfa, struct mq *q, s64a end); + +#endif diff --git a/regex/nfa/mpv_internal.h b/regex/nfa/mpv_internal.h new file mode 100644 index 000000000..a52853dce --- /dev/null +++ b/regex/nfa/mpv_internal.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MPV_INTERNAL_H +#define MPV_INTERNAL_H + +#include "ue2common.h" + +#define MPV_DOT 0 +#define MPV_VERM 1 +#define MPV_SHUFTI 2 +#define MPV_TRUFFLE 3 +#define MPV_NVERM 4 + +struct mpv_puffette { + u32 repeats; + char unbounded; + + /** + * \brief Report is simple-exhaustible. + * + * If this is true, we do best-effort suppression of runs of reports, only + * delivering the first one. 
+ */ + char simple_exhaust; + + ReportID report; +}; + +struct mpv_kilopuff { + u32 counter_offset; /**< offset (in full stream state) to the counter that + * this kilopuff refers to */ + u32 count; /**< number of real (non sentinel mpv puffettes) */ + u32 puffette_offset; /**< relative to base of mpv, points past the 1st + * sent */ + u64a dead_point; + u8 auto_restart; + u8 type; /* MPV_DOT, MPV_VERM, etc */ + union { + struct { + char c; + } verm; + struct { + m128 mask_lo; + m128 mask_hi; + } shuf; + struct { + m128 mask1; + m128 mask2; + } truffle; + } u; +}; + +struct mpv_counter_info { + u64a max_counter; /**< maximum value this counter needs to track */ + u32 counter_size; /**< number of bytes to represent the counter in stream + * state */ + u32 counter_offset; /**< offset that this counter is stored at in the + * full stream state */ + u32 kilo_begin; /**< first kilo to turn on when the counter is started */ + u32 kilo_end; /**< 1 + last kilo to turn on when the counter is started */ +}; + +struct ALIGN_AVX_DIRECTIVE mpv { + u32 kilo_count; /**< number of kilopuffs following */ + u32 counter_count; /**< number of counters managed by the mpv */ + u32 puffette_count; /**< total number of puffettes under all the kilos */ + u32 pq_offset; /**< offset to the priority queue in the decompressed + * state */ + u32 reporter_offset; /**< offset to the reporter mmbit in the decompressed + * state */ + u32 report_list_offset; /**< offset to the report list scratch space in the + * decompressed state */ + u32 active_offset; /**< offset to the active kp mmbit in the compressed + * state */ + u32 top_kilo_begin; /**< first kilo to switch on when top arrives */ + u32 top_kilo_end; /**< one past the last kilo to switch on when top + * arrives */ +}; + +struct mpv_decomp_kilo { + u64a limit; + const struct mpv_puffette *curr; +}; + +/* note: size varies on different platforms */ +struct mpv_decomp_state { + u32 pq_size; + char filled; + u64a counter_adj; /**< progress not yet written to the real counters */ + struct mpv_decomp_kilo active[]; +}; + +/* --- + * | | mpv + * --- + * | | + * | | kilo_count * mpv_kilopuffs + * | | + * ... + * | | + * --- + * | | + * | | counter_count * mpv_counter_infos + * | | + * ... + * | | + * --- + * | | sentinel mpv_puffette + * --- + * | | mpv_puffettes for 1st kilopuff + * | | (mpv_puffettes are ordered by minimum number of repeats) + * | | + * --- + * | | sentinel mpv_puffette + * --- + * | | mpv_puffettes for 2nd kilopuff + * ... + * | | + * --- + * | | sentinel mpv_puffette + * --- + */ + +/* + * Stream State + * [Compressed Counter 0] + * [Compressed Counter 1] + * ... + * [Compressed Counter N] + * [mmbit of active kilopuffs] + * + * Decompressed State + * [header (limit pq_size)] + * [ + * [kilo 1 current reports] + * ... + * [kilo N current reports] + * ] + * [ + * [Full Counter 0] + * [Full Counter 1] + * ... 
+ * [Full Counter N] + * ] + * [pq of kilo changes] + * [scratch space for current report lists (total number of puffettes)] + * [mmbit of kilopuffs with active reports] + */ + +struct mpv_pq_item { + u64a trigger_loc; + u32 kilo; +}; + +/* returns pointer to first non sentinel mpv_puff */ +static really_inline +const struct mpv_puffette *get_puff_array(const struct mpv *m, + const struct mpv_kilopuff *kp) { + return (const struct mpv_puffette *)((const char *)m + kp->puffette_offset); +} + +static really_inline +const struct mpv_counter_info *get_counter_info(const struct mpv *m) { + return (const struct mpv_counter_info *)((const char *)(m + 1) + + m->kilo_count * sizeof(struct mpv_kilopuff)); +} + +#define MPV_DEAD_VALUE (~0ULL) +#define INVALID_REPORT (~0U) + +#endif diff --git a/regex/nfa/nfa_api.h b/regex/nfa/nfa_api.h new file mode 100644 index 000000000..e3f7f7431 --- /dev/null +++ b/regex/nfa/nfa_api.h @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Declarations for the main NFA Engine API. + * + * This file provides the internal API for all runtime engines ("NFAs", even if + * they're not strictly NFA implementations). + */ + +#ifndef NFA_API_H +#define NFA_API_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; + +/** + * Indicates if an nfa is a zombie. Note: that there were plans for a more + * nuanced view of zombiehood but this never eventuated. + */ +enum nfa_zombie_status { + NFA_ZOMBIE_NO, /**< nfa is not a zombie and will respond to top events */ + NFA_ZOMBIE_ALWAYS_YES /**< nfa is a zombie and will always be a zombie */ +}; + +/** + * Compresses an engine's state. + * The expanded state (@ref mq::state, @ref mq::streamState) is reduced purely + * to a corresponding compressed stream state (@ref mq::streamState). + * + * @param nfa engine the state belongs to + * @param q queue for the engine. 
The final compressed stream state is placed
+ *          in the location indicated by @ref mq::streamState
+ * @param loc the location corresponding to the engine's current state
+ */
+char nfaQueueCompressState(const struct NFA *nfa, const struct mq *q, s64a loc);
+
+/**
+ * Expands an engine's compressed stream state into its scratch space
+ * representation. This is required before an engine starts operating over its
+ * queue.
+ *
+ * @param nfa engine the state belongs to
+ * @param dest location in scratch for decompressed state
+ * @param src compressed stream state
+ * @param offset the current stream offset.
+ * @param key byte corresponding to the location where the compressed state was
+ *            created.
+ */
+char nfaExpandState(const struct NFA *nfa, void *dest, const void *src,
+                    u64a offset, u8 key);
+
+/**
+ * Gives us a properly initialised dead state suitable for later @ref
+ * nfaQueueExec calls.
+ */
+char nfaQueueInitState(const struct NFA *nfa, struct mq *q);
+
+/**
+ * Initialise the state, applying a TOP appropriate for the offset. If the
+ * NFA becomes inactive, return zero. Otherwise, write out its compressed
+ * representation to `state' and return non-zero.
+ *
+ * @param nfa engine the state belongs to
+ * @param offset offset in the stream (relative to start of stream)
+ * @param state pointer indicating where the state is to be written
+ * @param key byte corresponding to the location where the compressed state is
+ *            to be created.
+ */
+char nfaInitCompressedState(const struct NFA *nfa, u64a offset, void *state,
+                            u8 key);
+
+/**
+ * Process the queued commands on the given NFA.
+ *
+ * @param nfa the NFA to execute
+ * @param q the queued commands. It must start with some variant of start and
+ *          end with some variant of end. The location field of the events must
+ *          be monotonically increasing.
+ * @param end stop processing command queue when we reach this point
+ *
+ * @return non-zero if the nfa is still active; if the nfa is not active, the
+ *         state data is undefined
+ *
+ * Note: this function cannot process events from the past: the location field
+ * of each event must be >= current offset.
+ */
+char nfaQueueExec(const struct NFA *nfa, struct mq *q, s64a end);
+
+/**
+ * Main execution function that doesn't perform the checks and optimisations of
+ * nfaQueueExec() and just dispatches directly to the nfa implementations. It is
+ * intended to be used by the Tamarama engine.
+ */
+char nfaQueueExec_raw(const struct NFA *nfa, struct mq *q, s64a end);
+
+/** Return value indicating that the engine is dead. */
+#define MO_DEAD 0
+
+/** Return value indicating that the engine is alive. */
+#define MO_ALIVE 1
+
+/** Return value from @ref nfaQueueExecToMatch indicating that engine progress
+ * stopped as a match state was reached. */
+#define MO_MATCHES_PENDING 2
+
+/**
+ * Process the queued commands on the given nfa up to end or the first match.
+ * This function will only fire the callback in response to a report_current
+ * being set and for accepts at the starting offset; in all other situations
+ * accepts will result in the queue pausing with a return value of
+ * @ref MO_MATCHES_PENDING.
+ *
+ * @param nfa the NFA to execute
+ * @param q the queued commands. It must start with some variant of start and
+ *          end with some variant of end. The location field of the events must
+ *          be monotonically increasing. If not all the data was processed during
+ *          the call, the queue is updated to reflect the remaining work.
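+ *          (When @ref MO_MATCHES_PENDING is returned, the queue is left
+ *          positioned at the match, so the pending matches can be raised with
+ *          nfaReportCurrentMatches() before resuming.)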
+ * @param end stop processing command queue when we reach this point + * + * @return @ref MO_ALIVE if the nfa is still active with no matches pending, + * and @ref MO_MATCHES_PENDING if there are matches pending, 0 if not + * alive + * + * Note: if it can be determined that the stream can never match, the nfa + * may be reported as dead even if not all the data was scanned + * + * Note: if the nfa is not alive the state data is undefined + * + * Note: this function can not process events from the past: the location field + * of each event must be >= current offset. + */ +char nfaQueueExecToMatch(const struct NFA *nfa, struct mq *q, s64a end); + +/** + * Main execution function that doesn't perform the checks and optimisations of + * nfaQueueExecToMatch() and just dispatches directly to the nfa + * implementations. It is intended to be used by the Tamarama engine. + */ +char nfaQueueExec2_raw(const struct NFA *nfa, struct mq *q, s64a end); + +/** + * Report matches at the current queue location. + * + * @param nfa the NFA to execute + * @param q the queued commands. It must start with some variant of start and + * end with some variant of end. The location field of the events must + * be monotonically increasing. + * + * Note: the queue MUST be located at position where @ref nfaQueueExecToMatch + * returned @ref MO_MATCHES_PENDING. + * + * Note: the return value of this call is undefined, and should be ignored. + */ +char nfaReportCurrentMatches(const struct NFA *nfa, struct mq *q); + +/** + * Returns non-zero if the NFA is in an accept state with the given report ID. + */ +char nfaInAcceptState(const struct NFA *nfa, ReportID report, struct mq *q); + +/** + * Returns non-zero if the NFA is in any accept state regardless of report + * ID. + */ +char nfaInAnyAcceptState(const struct NFA *nfa, struct mq *q); + +/** + * Process the queued commands on the given NFA up to end or the first match. + * + * Note: This version is meant for rose prefix/infix NFAs: + * - never uses a callback + * - loading of state at a point in history is not special cased + * + * @param nfa the NFA to execute + * @param q the queued commands. It must start with some variant of start and + * end with some variant of end. The location field of the events must + * be monotonically increasing. If not all the data was processed during + * the call, the queue is updated to reflect the remaining work. + * @param report we are interested in. If the given report will be raised at + * the end location, the function returns @ref MO_MATCHES_PENDING. If no + * match information is desired, MO_INVALID_IDX should be passed in. + * @return @ref MO_ALIVE if the nfa is still active with no matches pending, + * and @ref MO_MATCHES_PENDING if there are matches pending, 0 if not + * alive + * + * Note: if it can be determined that the stream can never match, the nfa + * may be reported as dead even if not all the data was scanned + * + * Note: if the NFA is not active the state data is undefined. + */ +char nfaQueueExecRose(const struct NFA *nfa, struct mq *q, ReportID report); + +/** + * Runs an NFA in reverse from (buf + buflen) to buf and then from (hbuf + hlen) + * to hbuf (main buffer and history buffer). + * + * Note: provides the match location as the "end" offset when the callback is + * called. 
+ * + * @param nfa engine to run + * @param offset base offset of buf + * @param buf main buffer + * @param buflen length of buf + * @param hbuf history buf + * @param hlen length of hbuf + * @param callback the callback to call for each match raised + * @param context context pointer passed to each callback + */ +char nfaBlockExecReverse(const struct NFA *nfa, u64a offset, const u8 *buf, + size_t buflen, const u8 *hbuf, size_t hlen, + NfaCallback callback, void *context); + +/** + * Check whether the given NFA's state indicates that it is in one or more + * final (accept at end of data) state. If so, call the callback for each + * match. + * + * @param nfa the NFA to execute + * @param state current state associated with this NFA + * @param streamState stream version of the state associated with this NFA + * (including br region) + * @param offset the offset to return (via the callback) with each match + * @param callback the callback to call for each match raised + * @param context context pointer passed to each callback + * + * @return @ref MO_HALT_MATCHING if the user instructed us to halt, otherwise + * @ref MO_CONTINUE_MATCHING. + */ +char nfaCheckFinalState(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); + +/** + * Indicates if an engine is a zombie. + * + * @param nfa engine to consider + * @param q queue corresponding to the engine + * @param loc current location in the buffer for an engine + */ +enum nfa_zombie_status nfaGetZombieStatus(const struct NFA *nfa, struct mq *q, + s64a loc); +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/regex/nfa/nfa_api_dispatch.c b/regex/nfa/nfa_api_dispatch.c new file mode 100644 index 000000000..75cac4b48 --- /dev/null +++ b/regex/nfa/nfa_api_dispatch.c @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + \brief Dispatches NFA engine API calls to the appropriate engines +*/ +#include "nfa_api.h" + +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "ue2common.h" + +// Engine implementations. +#include "castle.h" +#include "gough.h" +#include "lbr.h" +#include "limex.h" +#include "mcclellan.h" +#include "mcsheng.h" +#include "mpv.h" +#include "sheng.h" +#include "tamarama.h" + +#define DISPATCH_CASE(dc_ltype, dc_ftype, dc_func_call) \ + case dc_ltype: \ + return nfaExec##dc_ftype##dc_func_call; \ + break + +// general framework calls + +#define DISPATCH_BY_NFA_TYPE(dbnt_func) \ + switch (nfa->type) { \ + DISPATCH_CASE(LIMEX_NFA_32, LimEx32, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_64, LimEx64, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_128, LimEx128, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_256, LimEx256, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_384, LimEx384, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_512, LimEx512, dbnt_func); \ + DISPATCH_CASE(MCCLELLAN_NFA_8, McClellan8, dbnt_func); \ + DISPATCH_CASE(MCCLELLAN_NFA_16, McClellan16, dbnt_func); \ + DISPATCH_CASE(GOUGH_NFA_8, Gough8, dbnt_func); \ + DISPATCH_CASE(GOUGH_NFA_16, Gough16, dbnt_func); \ + DISPATCH_CASE(MPV_NFA, Mpv, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_DOT, LbrDot, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_VERM, LbrVerm, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_NVERM, LbrNVerm, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_SHUF, LbrShuf, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_TRUF, LbrTruf, dbnt_func); \ + DISPATCH_CASE(CASTLE_NFA, Castle, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA, Sheng, dbnt_func); \ + DISPATCH_CASE(TAMARAMA_NFA, Tamarama, dbnt_func); \ + DISPATCH_CASE(MCSHENG_NFA_8, McSheng8, dbnt_func); \ + DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA_32, Sheng32, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA_64, Sheng64, dbnt_func); \ + DISPATCH_CASE(MCSHENG_64_NFA_8, McSheng64_8, dbnt_func); \ + DISPATCH_CASE(MCSHENG_64_NFA_16, McSheng64_16, dbnt_func); \ + default: \ + assert(0); \ + } + +char nfaCheckFinalState(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + + // Caller should avoid calling us if we can never produce matches. + assert(nfaAcceptsEod(nfa)); + + DISPATCH_BY_NFA_TYPE(_testEOD(nfa, state, streamState, offset, callback, + context)); + return 0; +} + +char nfaQueueInitState(const struct NFA *nfa, struct mq *q) { + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + + DISPATCH_BY_NFA_TYPE(_queueInitState(nfa, q)); + return 0; +} + +static really_inline +char nfaQueueExec_i(const struct NFA *nfa, struct mq *q, s64a end) { + DISPATCH_BY_NFA_TYPE(_Q(nfa, q, end)); + return 0; +} + +static really_inline +char nfaQueueExec2_i(const struct NFA *nfa, struct mq *q, s64a end) { + DISPATCH_BY_NFA_TYPE(_Q2(nfa, q, end)); + return 0; +} + +char nfaQueueExec_raw(const struct NFA *nfa, struct mq *q, s64a end) { + return nfaQueueExec_i(nfa, q, end); +} + +char nfaQueueExec2_raw(const struct NFA *nfa, struct mq *q, s64a end) { + return nfaQueueExec2_i(nfa, q, end); +} + +static really_inline +char nfaQueueExecRose_i(const struct NFA *nfa, struct mq *q, ReportID report) { + DISPATCH_BY_NFA_TYPE(_QR(nfa, q, report)); + return 0; +} + +/** Returns 0 if this NFA cannot possibly match (due to width constraints etc) + * and the caller should return 0. May also edit the queue. 
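+ * In particular, queue items located beyond nfa->maxOffset are clamped to an
+ * MQE_END event at that boundary and *q_trimmed is set.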
*/ +static really_inline +char nfaQueueCanMatch(const struct NFA *nfa, struct mq *q, s64a end, + char *q_trimmed) { + assert(q_trimmed); + assert(q->end - q->cur >= 2); + assert(end >= 0); + + DEBUG_PRINTF("q->offset=%llu, end=%lld\n", q->offset, end); + DEBUG_PRINTF("maxBiAnchoredWidth=%u, maxOffset=%u\n", + nfa->maxBiAnchoredWidth, nfa->maxOffset); + + if (nfa->maxBiAnchoredWidth && + (end + q->offset > nfa->maxBiAnchoredWidth)) { + DEBUG_PRINTF("stream too long: o %llu l %zu max: %hhu\n", q->offset, + q->length, nfa->maxBiAnchoredWidth); + return 0; + } + + if (nfa->maxOffset) { + if (q->offset >= nfa->maxOffset) { + DEBUG_PRINTF("stream is past maxOffset\n"); + return 0; + } + + if (q->offset + end > nfa->maxOffset) { + s64a maxEnd = nfa->maxOffset - q->offset; + DEBUG_PRINTF("me %lld off %llu len = %lld\n", maxEnd, + q->offset, end); + while (q->end > q->cur + && q->items[q->end - 1].location > maxEnd) { + *q_trimmed = 1; + DEBUG_PRINTF("killing item %u %lld %u\n", q->end, + q->items[q->end - 1].location, + q->items[q->end - 1].type); + q->items[q->end - 1].location = maxEnd; + q->items[q->end - 1].type = MQE_END; + if (q->end - q->cur < 2 + ||q->items[q->end - 2].location <= maxEnd) { + break; + } + q->end--; + } + + if (q->end - q->cur < 2) { /* nothing left on q */ + DEBUG_PRINTF("queue empty\n"); + return 0; + } + } + +#ifdef DEBUG + if (*q_trimmed) { + debugQueue(q); + } +#endif + } + + return 1; +} + +char nfaQueueExec(const struct NFA *nfa, struct mq *q, s64a end) { + DEBUG_PRINTF("nfa=%p end=%lld\n", nfa, end); +#ifdef DEBUG + debugQueue(q); +#endif + + assert(q && q->context && q->state); + assert(end >= 0); + assert(q->cur < q->end); + assert(q->end <= MAX_MQE_LEN); + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + assert(end < q->items[q->end - 1].location + || q->items[q->end - 1].type == MQE_END); + + if (q->items[q->cur].location > end) { + return 1; + } + + char q_trimmed = 0; + + assert(end <= (s64a)q->length || !q->hlength); + /* due to reverse accel in block mode some queues may work on a truncated + * buffer */ + if (end > (s64a)q->length) { + end = q->length; + q_trimmed = 1; + } + + if (!nfaQueueCanMatch(nfa, q, end, &q_trimmed)) { + if (q->report_current) { + nfaReportCurrentMatches(nfa, q); + q->report_current = 0; + } + + return 0; + } + + char rv = nfaQueueExec_i(nfa, q, end); + +#ifdef DEBUG + debugQueue(q); +#endif + + assert(!q->report_current); + DEBUG_PRINTF("returned rv=%d, q_trimmed=%d\n", rv, q_trimmed); + return rv && !q_trimmed; +} + +char nfaQueueExecToMatch(const struct NFA *nfa, struct mq *q, s64a end) { + DEBUG_PRINTF("nfa=%p end=%lld\n", nfa, end); +#ifdef DEBUG + debugQueue(q); +#endif + + assert(q); + assert(end >= 0); + assert(q->state); + assert(q->cur < q->end); + assert(q->end <= MAX_MQE_LEN); + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + assert(end < q->items[q->end - 1].location + || q->items[q->end - 1].type == MQE_END); + + char q_trimmed_ra = 0; + assert(end <= (s64a)q->length || !q->hlength); + /* due to reverse accel in block mode some queues may work on a truncated + * buffer */ + if (q->items[q->cur].location > end) { + return 1; + } + + if (end > (s64a)q->length) { + end = q->length; + q_trimmed_ra = 1; + } + + char q_trimmed = 0; + if (!nfaQueueCanMatch(nfa, q, end, &q_trimmed)) { + if (q->report_current) { + nfaReportCurrentMatches(nfa, q); + q->report_current = 0; + } + + return 0; + } + + char rv = nfaQueueExec2_i(nfa, q, end); + assert(!q->report_current); + DEBUG_PRINTF("returned rv=%d, 
q_trimmed=%d\n", rv, q_trimmed); + if (rv == MO_MATCHES_PENDING) { + if (q_trimmed) { + // We need to "fix" the queue so that subsequent operations must + // trim it as well. + assert(q->end > 0); + assert(nfa->maxOffset); + q->items[q->end - 1].location = nfa->maxOffset + 1; + } + return rv; + } + return rv && !q_trimmed && !q_trimmed_ra; +} + +char nfaReportCurrentMatches(const struct NFA *nfa, struct mq *q) { + DISPATCH_BY_NFA_TYPE(_reportCurrent(nfa, q)); + return 0; +} + +char nfaInAcceptState(const struct NFA *nfa, ReportID report, struct mq *q) { + DISPATCH_BY_NFA_TYPE(_inAccept(nfa, report, q)); + return 0; +} + +char nfaInAnyAcceptState(const struct NFA *nfa, struct mq *q) { + DISPATCH_BY_NFA_TYPE(_inAnyAccept(nfa, q)); + return 0; +} + +char nfaQueueExecRose(const struct NFA *nfa, struct mq *q, ReportID r) { + DEBUG_PRINTF("nfa=%p\n", nfa); +#ifdef DEBUG + debugQueue(q); +#endif + + assert(q && !q->context && q->state); + assert(q->cur <= q->end); + assert(q->end <= MAX_MQE_LEN); + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + assert(!q->report_current); + + return nfaQueueExecRose_i(nfa, q, r); +} + +char nfaBlockExecReverse(const struct NFA *nfa, u64a offset, const u8 *buf, + size_t buflen, const u8 *hbuf, size_t hlen, + NfaCallback callback, void *context) { + assert(nfa); + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + + DISPATCH_BY_NFA_TYPE(_B_Reverse(nfa, offset, buf, buflen, hbuf, hlen, + callback, context)); + return 0; +} + +char nfaQueueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc) { + assert(nfa && q); + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + + DISPATCH_BY_NFA_TYPE(_queueCompressState(nfa, q, loc)); + return 0; +} + +char nfaExpandState(const struct NFA *nfa, void *dest, const void *src, + u64a offset, u8 key) { + assert(nfa && dest && src); + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + + DISPATCH_BY_NFA_TYPE(_expandState(nfa, dest, src, offset, key)); + return 0; +} + +char nfaInitCompressedState(const struct NFA *nfa, u64a offset, void *state, + u8 key) { + assert(nfa && state); + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + + DISPATCH_BY_NFA_TYPE(_initCompressedState(nfa, offset, state, key)); + return 0; +} + +enum nfa_zombie_status nfaGetZombieStatus(const struct NFA *nfa, struct mq *q, + s64a loc) { + DISPATCH_BY_NFA_TYPE(_zombie_status(nfa, q, loc)); + return NFA_ZOMBIE_NO; +} diff --git a/regex/nfa/nfa_api_queue.h b/regex/nfa/nfa_api_queue.h new file mode 100644 index 000000000..e3579a7ee --- /dev/null +++ b/regex/nfa/nfa_api_queue.h @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef NFA_API_QUEUE_H
+#define NFA_API_QUEUE_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "ue2common.h"
+#include "callback.h"
+
+/** Size of mq::items, max elements on a queue. */
+#define MAX_MQE_LEN 10
+
+/** Queue events */
+
+/** Queue event: begin scanning. Note: stateless engines will start from this
+ * location. */
+#define MQE_START 0U
+
+/** Queue event: stop scanning. */
+#define MQE_END 1U
+
+/** Queue event: enable start and start-dot-star. */
+#define MQE_TOP 2U
+
+/** Queue event: first event corresponding to a numbered TOP. Additional tops
+ * (in multi-top engines) use the event values from MQE_TOP_FIRST to
+ * MQE_INVALID - 1. */
+#define MQE_TOP_FIRST 4U
+
+/** Invalid queue event */
+#define MQE_INVALID (~0U)
+
+/** Queue item */
+struct mq_item {
+    u32 type; /**< event type, from MQE_* */
+    s64a location; /**< relative to the start of the current buffer */
+    u64a som; /**< pattern start-of-match corresponding to a top, only used
+               * by som engines. */
+};
+
+// Forward decl.
+struct NFA;
+
+/**
+ * Queue of events to control engine execution. mq::cur is index of first
+ * valid event, mq::end is one past the index of last valid event.
+ */
+struct mq {
+    const struct NFA *nfa; /**< nfa corresponding to the queue */
+    u32 cur; /**< index of the first valid item in the queue */
+    u32 end; /**< index one past the last valid item in the queue */
+    char *state; /**< uncompressed stream state; lives in scratch */
+    char *streamState; /**<
+                        * real stream state; used to access structures which
+                        * are not duplicated in the scratch state (bounded
+                        * repeats, etc) */
+    u64a offset; /**< base offset of the buffer */
+    const u8 *buffer; /**< buffer to scan */
+    size_t length; /**< length of buffer */
+    const u8 *history; /**<
+                        * history buffer; (logically) immediately before the
+                        * main buffer */
+    size_t hlength; /**< length of the history buffer */
+    struct hs_scratch *scratch; /**< global scratch space */
+    char report_current; /**<
+                          * report_current matches at starting offset through
+                          * callback. If true, the queue must be located at a
+                          * point where MO_MATCHES_PENDING was returned */
+    NfaCallback cb; /**< callback to trigger on matches */
+    void *context; /**< context to pass along with a callback */
+    struct mq_item items[MAX_MQE_LEN]; /**< queue items */
+};
+
+
+/**
+ * Pushes an (event, location, som) item onto a queue. If it is identical to the
+ * previous item on the queue, it is not added to the queue.
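+ *
+ * A minimal usage sketch (illustrative only; top_loc, som and buf_len stand in
+ * for values supplied by the caller):
+ *
+ *     pushQueueNoMerge(q, MQE_START, 0);
+ *     pushQueueSom(q, MQE_TOP, top_loc, som);
+ *     pushQueueNoMerge(q, MQE_END, buf_len);
+ *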
+ * @param q queue + * @param e event + * @param som som marker + * @param loc event location + */ +static really_inline +void pushQueueSom(struct mq * restrict q, u32 e, s64a loc, u64a som) { + DEBUG_PRINTF("pushing %u@%lld -> %u [som = %llu]\n", e, loc, q->end, som); + assert(q->end < MAX_MQE_LEN); + assert(e < MQE_INVALID); +/* stop gcc getting too smart for its own good */ +/* assert(!q->end || q->items[q->end - 1].location <= loc); */ + assert(q->end || e == MQE_START); + + // Avoid duplicate items on the queue. + if (q->end) { + struct mq_item *item = &q->items[q->end - 1]; + if (item->type == e && item->location == loc) { + DEBUG_PRINTF("dropping duplicate item\n"); + LIMIT_TO_AT_MOST(&item->som, som); /* take lower som */ + return; + } + } + + u32 end = q->end; + struct mq_item *item = &q->items[end]; + item->type = e; + item->location = loc; + item->som = som; + q->end = end + 1; +} + +/** + * Pushes an (event, location) item onto a queue. If it is identical to the + * previous item on the queue, it is not added to the queue. + * @param q queue + * @param e event + * @param loc event location + */ +static really_inline +void pushQueue(struct mq * restrict q, u32 e, s64a loc) { + pushQueueSom(q, e, loc, 0); +} + +/** + * Pushes an (event, location) item onto a queue. + * This version of @ref pushQueue does not check to ensure that the item being + * added is not already on the queue. Used for events other than tops. + */ +static really_inline +void pushQueueNoMerge(struct mq * restrict q, u32 e, s64a loc) { + DEBUG_PRINTF("pushing %u@%lld -> %u\n", e, loc, q->end); + assert(q->end < MAX_MQE_LEN); + assert(e < MQE_INVALID); +/* stop gcc getting too smart for its own good */ +/* assert(!q->end || q->items[q->end - 1].location <= loc); */ + assert(q->end || e == MQE_START); + +#ifndef NDEBUG + // We assert that the event is different from its predecessor. If it's a + // dupe, you should have used the ordinary pushQueue call. + if (q->end) { + UNUSED struct mq_item *prev = &q->items[q->end - 1]; + assert(prev->type != e || prev->location != loc); + } +#endif + + u32 end = q->end; + struct mq_item *item = &q->items[end]; + item->type = e; + item->location = loc; + item->som = 0; + q->end = end + 1; +} + +/** \brief Returns the type of the current queue event. */ +static really_inline u32 q_cur_type(const struct mq *q) { + assert(q->cur < q->end); + assert(q->cur < MAX_MQE_LEN); + return q->items[q->cur].type; +} + +/** \brief Returns the location (relative to the beginning of the current data + * buffer) of the current queue event. */ +static really_inline s64a q_cur_loc(const struct mq *q) { + assert(q->cur < q->end); + assert(q->cur < MAX_MQE_LEN); + return q->items[q->cur].location; +} + +/** \brief Returns the type of the last event in the queue. */ +static really_inline u32 q_last_type(const struct mq *q) { + assert(q->cur < q->end); + assert(q->end > 0); + assert(q->end <= MAX_MQE_LEN); + return q->items[q->end - 1].type; +} + +/** \brief Returns the location (relative to the beginning of the current data + * buffer) of the last event in the queue. */ +static really_inline s64a q_last_loc(const struct mq *q) { + assert(q->cur < q->end); + assert(q->end > 0); + assert(q->end <= MAX_MQE_LEN); + return q->items[q->end - 1].location; +} + +/** \brief Returns the absolute stream offset of the current queue event. 
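+ * For example, with q->offset == 100 and the current event at location 5,
+ * this returns 105.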
*/ +static really_inline u64a q_cur_offset(const struct mq *q) { + assert(q->cur < q->end); + assert(q->cur < MAX_MQE_LEN); + return q->offset + (u64a)q->items[q->cur].location; +} + +/** + * \brief Removes all events in the queue before the given location. + */ +static really_inline +void q_skip_forward_to(struct mq *q, s64a min_loc) { + assert(q->cur < q->end); + assert(q->cur < MAX_MQE_LEN); + assert(q->items[q->cur].type == MQE_START); + + if (q_cur_loc(q) >= min_loc) { + DEBUG_PRINTF("all events >= loc %lld\n", min_loc); + return; + } + + const u32 start_loc = q->cur; + + do { + DEBUG_PRINTF("remove item with loc=%lld\n", q_cur_loc(q)); + q->cur++; + } while (q->cur < q->end && q_cur_loc(q) < min_loc); + + if (q->cur > start_loc) { + // Move original MQE_START item forward. + q->cur--; + q->items[q->cur] = q->items[start_loc]; + } +} + +#ifdef DEBUG +// Dump the contents of the given queue. +static never_inline UNUSED +void debugQueue(const struct mq *q) { + DEBUG_PRINTF("q=%p, nfa=%p\n", q, q->nfa); + DEBUG_PRINTF("q offset=%llu, buf={%p, len=%zu}, history={%p, len=%zu}\n", + q->offset, q->buffer, q->length, q->history, q->hlength); + DEBUG_PRINTF("q cur=%u, end=%u\n", q->cur, q->end); + for (u32 cur = q->cur; cur < q->end; cur++) { + const char *type = "UNKNOWN"; + u32 e = q->items[cur].type; + switch (e) { + case MQE_START: + type = "MQE_START"; + break; + case MQE_END: + type = "MQE_END"; + break; + case MQE_TOP: + type = "MQE_TOP"; + break; + case MQE_INVALID: + type = "MQE_INVALID"; + break; + default: + assert(e >= MQE_TOP_FIRST && e < MQE_INVALID); + type = "MQE_TOP_N"; + break; + } + DEBUG_PRINTF("\tq[%u] %lld %u:%s\n", cur, q->items[cur].location, + q->items[cur].type, type); + } +} +#endif // DEBUG + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/nfa/nfa_api_util.h b/regex/nfa/nfa_api_util.h new file mode 100644 index 000000000..affc5f38f --- /dev/null +++ b/regex/nfa/nfa_api_util.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef NFA_API_UTIL_H +#define NFA_API_UTIL_H + +#include "nfa_api_queue.h" +#include "ue2common.h" + +/* returns the byte prior to the given location, NUL if not available */ +static really_inline +u8 queue_prev_byte(const struct mq *q, s64a loc) { + if (loc <= 0) { + if (1LL - loc > (s64a)q->hlength) { + return 0; /* assume NUL for start of stream write */ + } + // In the history buffer. + assert(q->history); + assert(q->hlength >= (u64a)(loc * -1)); + return q->history[q->hlength - 1 + loc]; + } else { + // In the stream write buffer. + assert(q->buffer); + assert(q->length >= (u64a)loc); + return q->buffer[loc - 1]; + } +} + +/* this is a modified version of pushQueue where we statically know the state of + * the queue. Does not attempt to merge and inserts at the given queue + * position. */ +static really_inline +void pushQueueAt(struct mq * restrict q, u32 pos, u32 e, s64a loc) { + assert(pos == q->end); + DEBUG_PRINTF("pushing %u@%lld -> %u\n", e, loc, q->end); + assert(q->end < MAX_MQE_LEN); + assert(e < MQE_INVALID); +/* stop gcc getting too smart for its own good */ +/* assert(!q->end || q->items[q->end - 1].location <= loc); */ + assert(q->end || e == MQE_START); + +#ifndef NDEBUG + // We assert that the event is different from its predecessor. If it's a + // dupe, you should have used the ordinary pushQueue call. + if (q->end) { + UNUSED struct mq_item *prev = &q->items[q->end - 1]; + assert(prev->type != e || prev->location != loc); + } +#endif + + struct mq_item *item = &q->items[pos]; + item->type = e; + item->location = loc; + item->som = 0; + q->end = pos + 1; +} +#endif diff --git a/regex/nfa/nfa_internal.h b/regex/nfa/nfa_internal.h new file mode 100644 index 000000000..ad27e28b1 --- /dev/null +++ b/regex/nfa/nfa_internal.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + \brief Declarations for the main NFA engine types and structures. 
+*/ +#ifndef NFA_INTERNAL_H +#define NFA_INTERNAL_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "ue2common.h" + +// Constants + +#define MO_INVALID_IDX 0xffffffff /**< index meaning value is invalid */ + +// Flags (used in NFA::flags) + +#define NFA_ACCEPTS_EOD 1U /**< can produce matches on EOD. */ +#define NFA_ZOMBIE 2U /**< supports zombies */ + +// Common data structures for NFAs + +enum NFAEngineType { + LIMEX_NFA_32, + LIMEX_NFA_64, + LIMEX_NFA_128, + LIMEX_NFA_256, + LIMEX_NFA_384, + LIMEX_NFA_512, + MCCLELLAN_NFA_8, /**< magic pseudo nfa */ + MCCLELLAN_NFA_16, /**< magic pseudo nfa */ + GOUGH_NFA_8, /**< magic pseudo nfa */ + GOUGH_NFA_16, /**< magic pseudo nfa */ + MPV_NFA, /**< magic pseudo nfa */ + LBR_NFA_DOT, /**< magic pseudo nfa */ + LBR_NFA_VERM, /**< magic pseudo nfa */ + LBR_NFA_NVERM, /**< magic pseudo nfa */ + LBR_NFA_SHUF, /**< magic pseudo nfa */ + LBR_NFA_TRUF, /**< magic pseudo nfa */ + CASTLE_NFA, /**< magic pseudo nfa */ + SHENG_NFA, /**< magic pseudo nfa */ + TAMARAMA_NFA, /**< magic nfa container */ + MCSHENG_NFA_8, /**< magic pseudo nfa */ + MCSHENG_NFA_16, /**< magic pseudo nfa */ + SHENG_NFA_32, /**< magic pseudo nfa */ + SHENG_NFA_64, /**< magic pseudo nfa */ + MCSHENG_64_NFA_8, /**< magic pseudo nfa */ + MCSHENG_64_NFA_16, /**< magic pseudo nfa */ + /** \brief bogus NFA - not used */ + INVALID_NFA +}; + +/** \brief header for the NFA implementation. */ +struct ALIGN_CL_DIRECTIVE NFA { + u32 flags; + + /** \brief The size in bytes of the NFA engine. The engine is + * serialized to the extent that copying length bytes back into a + * 16-byte aligned memory location yields a structure that has the same + * behaviour as the original engine. */ + u32 length; + + /** \brief Active implementation used by this NFAEngineType */ + u8 type; + + u8 rAccelType; + u8 rAccelOffset; + u8 maxBiAnchoredWidth; /**< if non zero, max width of the block */ + + union { + u8 c; + u16 dc; + u8 array[2]; + } rAccelData; + + u32 queueIndex; /**< index of the associated queue in scratch */ + + /** \brief The number of valid positions/states for this NFA. Debug only */ + u32 nPositions; + + /** \brief Size of the state required in scratch space. + * + * This state has less strict size requirements (as it doesn't go in stream + * state) and does not persist between stream writes. + */ + u32 scratchStateSize; + + /** \brief Size of the state required in stream state. + * + * This encompasses all state stored by the engine that must persist between + * stream writes. */ + u32 streamStateSize; + + u32 maxWidth; /**< longest possible match in this NFA, 0 if unbounded */ + u32 minWidth; /**< minimum bytes required to match this NFA */ + u32 maxOffset; /**< non zero: maximum offset this pattern can match at */ + + /* Note: implementation (e.g. a LimEx) directly follows struct in memory */ +} ; + +// Accessor macro for the implementation NFA: we do things this way to avoid +// type-punning warnings. +#define getImplNfa(nfa) \ + ((const void *)((const char *)(nfa) + sizeof(struct NFA))) + +// Non-const version of the above, used at compile time. +#define getMutableImplNfa(nfa) ((char *)(nfa) + sizeof(struct NFA)) + +static really_inline u32 nfaAcceptsEod(const struct NFA *nfa) { + return nfa->flags & NFA_ACCEPTS_EOD; +} + +static really_inline u32 nfaSupportsZombie(const struct NFA *nfa) { + return nfa->flags & NFA_ZOMBIE; +} + +/** \brief True if the given type (from NFA::type) is a McClellan DFA. 
*/ +static really_inline int isMcClellanType(u8 t) { + return t == MCCLELLAN_NFA_8 || t == MCCLELLAN_NFA_16; +} + +/** \brief True if the given type (from NFA::type) is a Sheng-McClellan hybrid + * DFA. */ +static really_inline int isShengMcClellanType(u8 t) { + return t == MCSHENG_NFA_8 || t == MCSHENG_NFA_16 || + t == MCSHENG_64_NFA_8 || t == MCSHENG_64_NFA_16; +} + +/** \brief True if the given type (from NFA::type) is a Gough DFA. */ +static really_inline int isGoughType(u8 t) { + return t == GOUGH_NFA_8 || t == GOUGH_NFA_16; +} + +/** \brief True if the given type (from NFA::type) is a Sheng DFA. */ +static really_inline int isSheng16Type(u8 t) { + return t == SHENG_NFA; +} + +/** \brief True if the given type (from NFA::type) is a Sheng32 DFA. */ +static really_inline int isSheng32Type(u8 t) { + return t == SHENG_NFA_32; +} + +/** \brief True if the given type (from NFA::type) is a Sheng64 DFA. */ +static really_inline int isSheng64Type(u8 t) { + return t == SHENG_NFA_64; +} + +/** \brief True if the given type (from NFA::type) is a Sheng16/32/64 DFA. */ +static really_inline int isShengType(u8 t) { + return t == SHENG_NFA || t == SHENG_NFA_32 || t == SHENG_NFA_64; +} + +/** + * \brief True if the given type (from NFA::type) is a McClellan, Gough or + * Sheng DFA. + */ +static really_inline int isDfaType(u8 t) { + return isMcClellanType(t) || isGoughType(t) || isShengType(t) + || isShengMcClellanType(t); +} + +static really_inline int isBigDfaType(u8 t) { + return t == MCCLELLAN_NFA_16 || t == MCSHENG_NFA_16 || t == GOUGH_NFA_16; +} + +static really_inline int isSmallDfaType(u8 t) { + return isDfaType(t) && !isBigDfaType(t); +} + +/** \brief True if the given type (from NFA::type) is an NFA. */ +static really_inline int isNfaType(u8 t) { + switch (t) { + case LIMEX_NFA_32: + case LIMEX_NFA_64: + case LIMEX_NFA_128: + case LIMEX_NFA_256: + case LIMEX_NFA_384: + case LIMEX_NFA_512: + return 1; + default: + break; + } + return 0; +} + +/** \brief True if the given type (from NFA::type) is an LBR. */ +static really_inline +int isLbrType(u8 t) { + return t == LBR_NFA_DOT || t == LBR_NFA_VERM || t == LBR_NFA_NVERM || + t == LBR_NFA_SHUF || t == LBR_NFA_TRUF; +} + +/** \brief True if the given type (from NFA::type) is a container engine. */ +static really_inline +int isContainerType(u8 t) { + return t == TAMARAMA_NFA; +} + +static really_inline +int isMultiTopType(u8 t) { + return !isDfaType(t) && !isLbrType(t); +} + +/** Macros used in place of unimplemented NFA API functions for a given + * engine. */ +#if !defined(_WIN32) + +/* Use for functions that return an integer. */ +#define NFA_API_NO_IMPL(...) \ + ({ \ + assert(!"not implemented for this engine!"); \ + 0; /* return value, for places that need it */ \ + }) + +/* Use for _zombie_status functions. */ +#define NFA_API_ZOMBIE_NO_IMPL(...) \ + ({ \ + assert(!"not implemented for this engine!"); \ + NFA_ZOMBIE_NO; \ + }) + +#else + +/* Simpler implementation for compilers that don't like the GCC extension used + * above. */ +#define NFA_API_NO_IMPL(...) 0 +#define NFA_API_ZOMBIE_NO_IMPL(...) 
NFA_ZOMBIE_NO + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/nfa/nfa_rev_api.h b/regex/nfa/nfa_rev_api.h new file mode 100644 index 000000000..370f96ef6 --- /dev/null +++ b/regex/nfa/nfa_rev_api.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Reverse-acceleration optimizations for the NFA API block mode scans. 
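+ *
+ * These helpers use the rAccelType/rAccelOffset/rAccelData fields of
+ * struct NFA to shrink a block-mode scan from the rear: they return a
+ * reduced block length beyond which a match cannot occur, or zero when
+ * the whole block can be skipped.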
+ */ + +#ifndef NFA_REV_API_H +#define NFA_REV_API_H + +#include "accel.h" +#include "nfa_internal.h" +#include "vermicelli.h" +#include "util/unaligned.h" + +static really_inline +size_t nfaRevAccel_i(const struct NFA *nfa, const u8 *buffer, size_t length) { + DEBUG_PRINTF("checking rev accel mw %u\n", nfa->minWidth); + assert(nfa->rAccelOffset >= 1); + assert(nfa->rAccelOffset <= nfa->minWidth); + + const u8 *rv; // result for accel engine + + switch (nfa->rAccelType) { + case ACCEL_RVERM: + DEBUG_PRINTF("ACCEL_RVERM\n"); + if (length + 1 - nfa->rAccelOffset < 16) { + break; + } + + rv = rvermicelliExec(nfa->rAccelData.c, 0, buffer, + buffer + length + 1 - nfa->rAccelOffset); + length = (size_t)(rv - buffer + nfa->rAccelOffset); + break; + case ACCEL_RVERM_NOCASE: + DEBUG_PRINTF("ACCEL_RVERM_NOCASE\n"); + if (length + 1 - nfa->rAccelOffset < 16) { + break; + } + + rv = rvermicelliExec(nfa->rAccelData.c, 1, buffer, + buffer + length + 1 - nfa->rAccelOffset); + length = (size_t)(rv - buffer + nfa->rAccelOffset); + break; + case ACCEL_RDVERM: + DEBUG_PRINTF("ACCEL_RDVERM\n"); + if (length + 1 - nfa->rAccelOffset < 17) { + break; + } + + rv = rvermicelliDoubleExec(nfa->rAccelData.array[0], + nfa->rAccelData.array[1], 0, buffer, + buffer + length + 1 - nfa->rAccelOffset); + length = (size_t)(rv - buffer + nfa->rAccelOffset); + break; + case ACCEL_RDVERM_NOCASE: + DEBUG_PRINTF("ACCEL_RVERM_NOCASE\n"); + if (length + 1 - nfa->rAccelOffset < 17) { + break; + } + + rv = rvermicelliDoubleExec(nfa->rAccelData.array[0], + nfa->rAccelData.array[1], 1, buffer, + buffer + length + 1 - nfa->rAccelOffset); + length = (size_t)(rv - buffer + nfa->rAccelOffset); + break; + case ACCEL_REOD: + DEBUG_PRINTF("ACCEL_REOD\n"); + if (buffer[length - nfa->rAccelOffset] != nfa->rAccelData.c) { + return 0; + } + break; + case ACCEL_REOD_NOCASE: + DEBUG_PRINTF("ACCEL_REOD_NOCASE\n"); + if ((buffer[length - nfa->rAccelOffset] & CASE_CLEAR) != + nfa->rAccelData.c) { + return 0; + } + break; + case ACCEL_RDEOD: + DEBUG_PRINTF("ACCEL_RDEOD\n"); + if (unaligned_load_u16(buffer + length - nfa->rAccelOffset) != + nfa->rAccelData.dc) { + return 0; + } + break; + case ACCEL_RDEOD_NOCASE: + DEBUG_PRINTF("ACCEL_RDEOD_NOCASE\n"); + if ((unaligned_load_u16(buffer + length - nfa->rAccelOffset) & + DOUBLE_CASE_CLEAR) != nfa->rAccelData.dc) { + return 0; + } + break; + default: + assert(!"not here"); + } + + if (nfa->minWidth > length) { + DEBUG_PRINTF("post-accel, scan skipped: %zu < min %u bytes\n", length, + nfa->minWidth); + return 0; + } + + return length; +} + +/** \brief Reverse acceleration check. Returns a new length for the block, + * guaranteeing that a match cannot occur beyond that point. */ +static really_inline +size_t nfaRevAccelCheck(const struct NFA *nfa, const u8 *buffer, + size_t length) { + assert(nfa); + + // If this block is not long enough to satisfy the minimum width + // constraint on this NFA, we can avoid the scan altogether. 
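+    // (A return value of 0 tells the caller that no match is possible in
+    // this block at all.)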
+ if (nfa->minWidth > length) { + DEBUG_PRINTF("scan skipped: %zu < min %u bytes\n", length, + nfa->minWidth); + return 0; + } + + if (nfa->rAccelType == ACCEL_NONE) { + DEBUG_PRINTF("no rev accel available\n"); + return length; + } + + size_t rv_length = nfaRevAccel_i(nfa, buffer, length); + assert(rv_length <= length); + return rv_length; +} + +#endif diff --git a/regex/nfa/repeat.c b/regex/nfa/repeat.c new file mode 100644 index 000000000..946459415 --- /dev/null +++ b/regex/nfa/repeat.c @@ -0,0 +1,1611 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief API for handling bounded repeats. + * + * This file provides an internal API for handling bounded repeats of character + * classes. It is used by the Large Bounded Repeat (LBR) engine and by the + * bounded repeat handling in the LimEx NFA engine as well. + */ +#include "repeat.h" +#include "util/bitutils.h" +#include "util/multibit.h" +#include "util/pack_bits.h" +#include "util/partial_store.h" +#include "util/unaligned.h" + +#ifndef __KERNEL__ +#include +#include +#else +#include +#include +#include +#define UINT32_MAX U32_MAX +#endif + +/** \brief Returns the total capacity of the ring. + * Note that it's currently one greater than repeatMax so that we can handle + * cases where the tug and pos triggers overlap. */ +static +u32 ringCapacity(const struct RepeatInfo *info) { + return info->repeatMax + 1; +} + +/** \brief Returns the number of elements currently in the ring. Note that if + * the first and last indices are equal, the ring is full. */ +static +u32 ringOccupancy(const struct RepeatRingControl *xs, const u32 ringSize) { + if (xs->last > xs->first) { + return xs->last - xs->first; + } else { // wrapped + return ringSize - (xs->first - xs->last); + } +} + +/** \brief Returns the offset of the _last_ top stored in the ring. 
*/ +static +u64a ringLastTop(const struct RepeatRingControl *xs, const u32 ringSize) { + return xs->offset + ringOccupancy(xs, ringSize) - 1; +} + +#if !defined(NDEBUG) || defined(DUMP_SUPPORT) +/** \brief For debugging: returns the total capacity of the range list. */ +static UNUSED +u32 rangeListCapacity(const struct RepeatInfo *info) { + u32 d = info->repeatMax - info->repeatMin; + assert(d > 0); // should be in a RING model! + return 2 * ((info->repeatMax / d) + 1); +} +#endif + +#ifdef DEBUG +static +void dumpRing(const struct RepeatInfo *info, const struct RepeatRingControl *xs, + const u8 *ring) { + const u32 ringSize = ringCapacity(info); + DEBUG_PRINTF("ring (occ %u/%u, %u->%u): ", ringOccupancy(xs, ringSize), + ringSize, xs->first, xs->last); + + u16 i = xs->first, n = 0; + do { + if (mmbit_isset(ring, ringSize, i)) { + u64a ringOffset = xs->offset + n; + printf("%llu ", ringOffset); + } + ++i, ++n; + if (i == ringSize) { + i = 0; + } + } while (i != xs->last); + printf("\n"); +} + +static +void dumpRange(const struct RepeatInfo *info, + const struct RepeatRangeControl *xs, const u16 *ring) { + const u32 ringSize = rangeListCapacity(info); + DEBUG_PRINTF("ring (occ %u/%u): ", xs->num, ringSize); + + if (xs->num) { + for (u32 i = 0; i < xs->num; i++) { + printf("%llu ", xs->offset + unaligned_load_u16(ring + i)); + } + } else { + printf("empty"); + } + printf("\n"); +} + +static +void dumpBitmap(const struct RepeatBitmapControl *xs) { + DEBUG_PRINTF("bitmap (base=%llu): ", xs->offset); + u64a bitmap = xs->bitmap; + while (bitmap) { + printf("%llu ", xs->offset + findAndClearLSB_64(&bitmap)); + } + printf("\n"); +} + +static +void dumpTrailer(const struct RepeatInfo *info, + const struct RepeatTrailerControl *xs) { + const u64a m_width = info->repeatMax - info->repeatMin; + DEBUG_PRINTF("trailer: current extent is [%llu,%llu]", xs->offset, + xs->offset + m_width); + u64a bitmap = xs->bitmap; + if (bitmap) { + printf(", also matches at: "); + while (bitmap) { + u32 idx = findAndClearMSB_64(&bitmap); + printf("%llu ", xs->offset - idx - 1); + } + } else { + printf(", no earlier matches"); + } + printf("\n"); +} + +#endif // DEBUG + +#ifndef NDEBUG +/** \brief For debugging: returns true if the range is ordered with no dupes. 
*/ +static UNUSED +int rangeListIsOrdered(const struct RepeatRangeControl *xs, const u16 *ring) { + for (u32 i = 1; i < xs->num; i++) { + u16 a = unaligned_load_u16(ring + i - 1); + u16 b = unaligned_load_u16(ring + i); + if (a >= b) { + return 0; + } + } + return 1; +} +#endif + +u64a repeatLastTopRing(const struct RepeatInfo *info, + const union RepeatControl *ctrl) { + const u32 ringSize = ringCapacity(info); + return ringLastTop(&ctrl->ring, ringSize); +} + +u64a repeatLastTopRange(const union RepeatControl *ctrl, const void *state) { + const u16 *ring = (const u16 *)state; + const struct RepeatRangeControl *xs = &ctrl->range; + assert(xs->num); + return xs->offset + unaligned_load_u16(ring + xs->num - 1); +} + +u64a repeatLastTopBitmap(const union RepeatControl *ctrl) { + const struct RepeatBitmapControl *xs = &ctrl->bitmap; + if (!xs->bitmap) { + /* last top was too long ago */ + return 0; + } + return xs->offset + 63 - clz64(xs->bitmap); +} + +u64a repeatLastTopTrailer(const struct RepeatInfo *info, + const union RepeatControl *ctrl) { + const struct RepeatTrailerControl *xs = &ctrl->trailer; + assert(xs->offset >= info->repeatMin); + return xs->offset - info->repeatMin; +} + +u64a repeatNextMatchRing(const struct RepeatInfo *info, + const union RepeatControl *ctrl, const void *state, + u64a offset) { + const struct RepeatRingControl *xs = &ctrl->ring; + const u8 *ring = (const u8 *)state; + const u32 ringSize = ringCapacity(info); + + // We should have at least one top stored. + assert(mmbit_any(ring, ringSize)); + assert(info->repeatMax < REPEAT_INF); + + // Increment offset, as we want the NEXT match. + offset++; + + const u64a base_offset = xs->offset; + DEBUG_PRINTF("offset=%llu, base_offset=%llu\n", offset, base_offset); + + u64a delta = offset - base_offset; + if (offset < base_offset || delta < info->repeatMin) { + DEBUG_PRINTF("before min repeat\n"); + return base_offset + info->repeatMin; + } + if (offset > ringLastTop(xs, ringSize) + info->repeatMax) { + DEBUG_PRINTF("ring is stale\n"); + return 0; // no more matches + } + + DEBUG_PRINTF("delta=%llu\n", delta); + u64a lower = delta > info->repeatMax ? delta - info->repeatMax : 0; + DEBUG_PRINTF("lower=%llu\n", lower); + + assert(lower < ringSize); + + // First scan, either to xs->last if there's no wrap-around or ringSize + // (end of the underlying multibit) if we are wrapping. + + u32 begin = xs->first + lower; + if (begin >= ringSize) { + // This branch and sub tested a lot faster than using % (integer div). + begin -= ringSize; + } + const u32 end = begin >= xs->last ? ringSize : xs->last; + u32 i = mmbit_iterate_bounded(ring, ringSize, begin, end); + if (i != MMB_INVALID) { + u32 j = i - begin + lower; + return MAX(offset, base_offset + j + info->repeatMin); + } + + // A second scan is necessary if we need to cope with wrap-around in the + // ring buffer. 
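+    // begin >= xs->last means the live region wraps past the end of the
+    // multibit, so indices [0, xs->last) still hold the newest entries.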
+ + if (begin >= xs->last) { + i = mmbit_iterate_bounded(ring, ringSize, 0, xs->last); + if (i != MMB_INVALID) { + u32 j = i + (ringSize - begin) + lower; + return MAX(offset, base_offset + j + info->repeatMin); + } + } + + return 0; +} + +u64a repeatNextMatchRange(const struct RepeatInfo *info, + const union RepeatControl *ctrl, const void *state, + u64a offset) { + const struct RepeatRangeControl *xs = &ctrl->range; + const u16 *ring = (const u16 *)state; + + assert(xs->num > 0); + assert(xs->num <= rangeListCapacity(info)); + assert(rangeListIsOrdered(xs, ring)); + assert(info->repeatMax < REPEAT_INF); + + for (u32 i = 0; i < xs->num; i++) { + u64a base = xs->offset + unaligned_load_u16(ring + i); + u64a first = base + info->repeatMin; + if (offset < first) { + return first; + } + if (offset < base + info->repeatMax) { + return offset + 1; + } + } + + return 0; +} + +u64a repeatNextMatchBitmap(const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatBitmapControl *xs = &ctrl->bitmap; + const u64a base = xs->offset; + u64a bitmap = xs->bitmap; + + // FIXME: quick exit if there is no match, based on last top in bitmap? + + while (bitmap) { + u64a top = base + findAndClearLSB_64(&bitmap); + if (offset < top + info->repeatMin) { + return top + info->repeatMin; + } + if (offset < top + info->repeatMax) { + return offset + 1; + } + } + + return 0; // No more matches. +} + +u64a repeatNextMatchTrailer(const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatTrailerControl *xs = &ctrl->trailer; + const u32 m_width = info->repeatMax - info->repeatMin; + + DEBUG_PRINTF("offset=%llu, xs->offset=%llu\n", offset, xs->offset); + DEBUG_PRINTF("{%u,%u} repeat, m_width=%u\n", info->repeatMin, + info->repeatMax, m_width); + + assert(xs->offset >= info->repeatMin); + + if (offset >= xs->offset + m_width) { + DEBUG_PRINTF("no more matches\n"); + return 0; + } + + if (offset >= xs->offset) { + DEBUG_PRINTF("inside most recent match window, next match %llu\n", + offset + 1); + return offset + 1; + } + + // Offset is before the match window, we need to consult the bitmap of + // earlier match offsets. + u64a bitmap = xs->bitmap; + + u64a diff = xs->offset - offset; + DEBUG_PRINTF("diff=%llu\n", diff); + if (diff <= 64) { + assert(diff); + bitmap &= (1ULL << (diff - 1)) - 1; + } + DEBUG_PRINTF("bitmap = 0x%llx\n", bitmap); + if (bitmap) { + u32 idx = 63 - clz64(bitmap); + DEBUG_PRINTF("clz=%u, idx = %u -> offset %llu\n", clz64(bitmap), idx, + xs->offset - idx); + DEBUG_PRINTF("next match at %llu\n", xs->offset - idx - 1); + u64a next_match = xs->offset - idx - 1; + assert(next_match > offset); + return next_match; + } + + DEBUG_PRINTF("next match is start of match window, %llu\n", xs->offset); + return xs->offset; +} + +/** \brief Store the first top in the ring buffer. 
*/ +static +void storeInitialRingTop(struct RepeatRingControl *xs, u8 *ring, + u64a offset, const u32 ringSize) { + DEBUG_PRINTF("ring=%p, ringSize=%u\n", ring, ringSize); + xs->offset = offset; + mmbit_clear(ring, ringSize); + mmbit_set(ring, ringSize, 0); + xs->first = 0; + xs->last = 1; +} + +static really_inline +char ringIsStale(const struct RepeatRingControl *xs, const u32 ringSize, + const u64a offset) { + u64a finalMatch = ringLastTop(xs, ringSize); + if (offset - finalMatch >= ringSize) { + DEBUG_PRINTF("all matches in ring are stale\n"); + return 1; + } + + return 0; +} + +void repeatStoreRing(const struct RepeatInfo *info, union RepeatControl *ctrl, + void *state, u64a offset, char is_alive) { + struct RepeatRingControl *xs = &ctrl->ring; + u8 *ring = (u8 *)state; + const u32 ringSize = ringCapacity(info); + assert(ringSize > 0); + + DEBUG_PRINTF("storing top for offset %llu in ring\n", offset); + + if (!is_alive || ringIsStale(xs, ringSize, offset)) { + storeInitialRingTop(xs, ring, offset, ringSize); + } else { + assert(offset > ringLastTop(xs, ringSize)); // Dupe or out of order. + u32 occ = ringOccupancy(xs, ringSize); + u64a diff = offset - xs->offset; + DEBUG_PRINTF("diff=%llu, occ=%u\n", diff, occ); + if (diff >= ringSize) { + u32 push = diff - ringSize + 1; + DEBUG_PRINTF("push ring %u\n", push); + xs->first += push; + if (xs->first >= ringSize) { + xs->first -= ringSize; + } + xs->offset += push; + diff -= push; + occ -= push; + } + + // There's now room in the ring for this top, so we write a run of + // zeroes, then a one. + DEBUG_PRINTF("diff=%llu, occ=%u\n", diff, occ); + assert(diff < ringSize); + assert(diff >= occ); + u32 n = diff - occ; + + u32 i = xs->last + n; + + mmbit_unset_range(ring, ringSize, xs->last, MIN(i, ringSize)); + if (i >= ringSize) { + i -= ringSize; + mmbit_unset_range(ring, ringSize, 0, i); + } + + assert(i != xs->first); + DEBUG_PRINTF("set bit %u\n", i); + mmbit_set(ring, ringSize, i); + xs->last = i + 1; + if (xs->last == ringSize) { + xs->last = 0; + } + } + + // Our ring indices shouldn't have spiraled off into uncharted space. + assert(xs->first < ringSize); + assert(xs->last < ringSize); + +#ifdef DEBUG + DEBUG_PRINTF("post-store ring state\n"); + dumpRing(info, xs, ring); +#endif + + // The final top stored in our ring should be the one we just wrote in. + assert(ringLastTop(xs, ringSize) == offset); +} + +static really_inline +void storeInitialRangeTop(struct RepeatRangeControl *xs, u16 *ring, + u64a offset) { + xs->offset = offset; + xs->num = 1; + unaligned_store_u16(ring, 0); +} + +void repeatStoreRange(const struct RepeatInfo *info, union RepeatControl *ctrl, + void *state, u64a offset, char is_alive) { + struct RepeatRangeControl *xs = &ctrl->range; + u16 *ring = (u16 *)state; + + if (!is_alive) { + DEBUG_PRINTF("storing initial top at %llu\n", offset); + storeInitialRangeTop(xs, ring, offset); + return; + } + + DEBUG_PRINTF("storing top at %llu, list currently has %u/%u elements\n", + offset, xs->num, rangeListCapacity(info)); + +#ifdef DEBUG + dumpRange(info, xs, ring); +#endif + + // Walk ring from front. Identify the number of stale elements, and shift + // the whole ring to delete them. 
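+    // An entry is stale once it lies more than repeatMax bytes behind the
+    // new top: it can no longer contribute a match at this offset or later.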
+ u32 i = 0; + for (; i < xs->num; i++) { + u64a this_offset = xs->offset + unaligned_load_u16(ring + i); + DEBUG_PRINTF("this_offset=%llu, diff=%llu\n", this_offset, + offset - this_offset); + if (offset - this_offset <= info->repeatMax) { + break; + } + } + + if (i == xs->num) { + DEBUG_PRINTF("whole ring is stale\n"); + storeInitialRangeTop(xs, ring, offset); + return; + } else if (i > 0) { + DEBUG_PRINTF("expiring %u stale tops\n", i); + u16 first_offset = unaligned_load_u16(ring + i); // first live top + for (u32 j = 0; j < xs->num - i; j++) { + u16 val = unaligned_load_u16(ring + i + j); + assert(val >= first_offset); + unaligned_store_u16(ring + j, val - first_offset); + } + xs->offset += first_offset; + xs->num -= i; + } + +#ifdef DEBUG + DEBUG_PRINTF("post-expire:\n"); + dumpRange(info, xs, ring); +#endif + + if (xs->num == 1) { + goto append; + } + + // Let d = repeatMax - repeatMin + // Examine penultimate entry x[-2]. + // If (offset - x[-2] <= d), then last entry x[-1] can be replaced with + // entry for offset. + assert(xs->num >= 2); + u32 d = info->repeatMax - info->repeatMin; + u64a penultimate_offset = + xs->offset + unaligned_load_u16(ring + xs->num - 2); + if (offset - penultimate_offset <= d) { + assert(offset - xs->offset <= (u16)-1); + unaligned_store_u16(ring + xs->num - 1, offset - xs->offset); + goto done; + } + + // Otherwise, write a new entry for offset and return. + +append: + assert(offset - xs->offset <= (u16)-1); + assert(xs->num < rangeListCapacity(info)); + unaligned_store_u16(ring + xs->num, offset - xs->offset); + xs->num++; + +done: + assert(rangeListIsOrdered(xs, ring)); +} + +void repeatStoreBitmap(const struct RepeatInfo *info, union RepeatControl *ctrl, + u64a offset, char is_alive) { + DEBUG_PRINTF("{%u,%u} repeat, storing top at %llu\n", info->repeatMin, + info->repeatMax, offset); + + struct RepeatBitmapControl *xs = &ctrl->bitmap; + if (!is_alive || !xs->bitmap) { + DEBUG_PRINTF("storing initial top at %llu\n", offset); + xs->offset = offset; + xs->bitmap = 1U; + return; + } + +#ifdef DEBUG + DEBUG_PRINTF("pre-store:\n"); + dumpBitmap(xs); +#endif + + assert(offset >= xs->offset); + + u64a last_top = xs->offset + 63 - clz64(xs->bitmap); + if (offset > last_top + info->repeatMax) { + DEBUG_PRINTF("bitmap stale, storing initial top\n"); + xs->offset = offset; + xs->bitmap = 1U; + return; + } + + u64a diff = offset - xs->offset; + if (diff >= info->repeatMax + 1) { + DEBUG_PRINTF("need expire, diff=%llu\n", diff); + u64a push = diff - info->repeatMax; + xs->offset += push; + xs->bitmap = push >= 64 ? 0 : xs->bitmap >> push; + DEBUG_PRINTF("pushed xs->offset to %llu\n", xs->offset); + } + + // Write a new entry. + diff = offset - xs->offset; + assert(diff < 64); + xs->bitmap |= (1ULL << diff); + +#ifdef DEBUG + DEBUG_PRINTF("post-store:\n"); + dumpBitmap(xs); +#endif +} + +/** \brief Returns 1 if the ring has a match between (logical) index \a lower + * and \a upper, excluding \a upper. */ +static +int ringHasMatch(const struct RepeatRingControl *xs, const u8 *ring, + const u32 ringSize, u32 lower, u32 upper) { + assert(lower < upper); + assert(lower < ringSize); + assert(upper <= ringSize); + + u32 i = xs->first + lower; + if (i >= ringSize) { + i -= ringSize; + } + + // Performance tweak: if we're looking at a fixed repeat, we can just use + // mmbit_isset. 
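+    // (lower + 1 == upper means there is exactly one candidate index.)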
+ if (lower + 1 == upper) { + return mmbit_isset(ring, ringSize, i); + } + + u32 end = xs->first + upper; + if (end >= ringSize) { + end -= ringSize; + } + + // First scan, either to end if there's no wrap-around or ringSize (end of + // the underlying multibit) if we are wrapping. + + u32 scan_end = i < end ? end : ringSize; + u32 m = mmbit_iterate_bounded(ring, ringSize, i, scan_end); + if (m != MMB_INVALID) { + return 1; + } + + // A second scan is necessary if we need to cope with wrap-around in the + // ring buffer. + + if (i >= end) { + m = mmbit_iterate_bounded(ring, ringSize, 0, end); + return m != MMB_INVALID; + } + + return 0; +} + +/** Return a mask of ones in bit positions [0..v]. */ +static really_inline +u64a mask_ones_to(u32 v) { + if (v < 63) { + return (1ULL << (v + 1)) - 1; + } else { + return ~(0ULL); + } +} + +void repeatStoreTrailer(const struct RepeatInfo *info, + union RepeatControl *ctrl, u64a offset, char is_alive) { + DEBUG_PRINTF("{%u,%u} repeat, top at %llu\n", info->repeatMin, + info->repeatMax, offset); + + struct RepeatTrailerControl *xs = &ctrl->trailer; + + /* The TRAILER repeat model stores the following data in its control block: + * + * 1. offset, which is the min extent of the most recent match window + * (i.e. corresponding to the most recent top) + * 2. bitmap, which is a bitmap of up to repeatMin matches before + * the min extent offset. + */ + + const u64a next_extent = offset + info->repeatMin; + + if (!is_alive) { + xs->offset = next_extent; + xs->bitmap = 0; + DEBUG_PRINTF("initial top, set extent to %llu\n", next_extent); + return; + } + +#ifdef DEBUG + DEBUG_PRINTF("pre-store:\n"); + dumpTrailer(info, xs); +#endif + + const u32 m_width = info->repeatMax - info->repeatMin; + DEBUG_PRINTF("most recent match window is [%llu,%llu]\n", xs->offset, + xs->offset + m_width); + + assert(next_extent > xs->offset); + u64a diff = next_extent - xs->offset; + DEBUG_PRINTF("diff=%llu, m_width=%u\n", diff, m_width); + + assert(diff); + xs->bitmap = diff < 64 ? xs->bitmap << diff : 0; + + // Switch on bits in the bitmask corresponding to matches in the previous + // match window. + if (diff <= m_width) { + u64a m = mask_ones_to(diff - 1); + xs->bitmap |= m; + } else { + u64a shift = diff - m_width - 1; + if (shift < 64) { + u64a m = mask_ones_to(m_width); + m <<= shift; + xs->bitmap |= m; + } + } + + DEBUG_PRINTF("bitmap=0x%llx\n", xs->bitmap); + + // Update max extent. + xs->offset = next_extent; + + // Trim stale history: we only need repeatMin bytes of history. 
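+    // Keeping bits [0, repeatMin] preserves the most recent repeatMin + 1
+    // positions; for repeatMin >= 63 the full 64-bit bitmap is kept as-is.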
+ if (info->repeatMin < 63) { + u64a mask = (1ULL << (info->repeatMin + 1)) - 1; + xs->bitmap &= mask; + } + +#ifdef DEBUG + DEBUG_PRINTF("post-store:\n"); + dumpTrailer(info, xs); +#endif +} + +enum RepeatMatch repeatHasMatchRing(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset) { + const struct RepeatRingControl *xs = &ctrl->ring; + const u8 *ring = (const u8 *)state; + const u32 ringSize = ringCapacity(info); + + assert(mmbit_any(ring, ringSize)); + assert(offset >= xs->offset); + + DEBUG_PRINTF("check: offset=%llu, repeat=[%u,%u]\n", offset, + info->repeatMin, info->repeatMax); +#ifdef DEBUG + DEBUG_PRINTF("ring state\n"); + dumpRing(info, xs, ring); +#endif + + if (offset - xs->offset < info->repeatMin) { + DEBUG_PRINTF("haven't even seen repeatMin bytes yet!\n"); + return REPEAT_NOMATCH; + } + + if (offset - ringLastTop(xs, ringSize) >= ringSize) { + DEBUG_PRINTF("ring is stale\n"); + return REPEAT_STALE; + } + + // If we're not stale, delta fits in the range [repeatMin, lastTop + + // repeatMax], which fits in a u32. + assert(offset - xs->offset < UINT32_MAX); + u32 delta = (u32)(offset - xs->offset); + DEBUG_PRINTF("delta=%u\n", delta); + + // Find the bounds on possible matches in the ring buffer. + u32 lower = delta > info->repeatMax ? delta - info->repeatMax : 0; + u32 upper = MIN(delta - info->repeatMin + 1, ringOccupancy(xs, ringSize)); + + if (lower >= upper) { + DEBUG_PRINTF("no matches to check\n"); + return REPEAT_NOMATCH; + } + + DEBUG_PRINTF("possible match indices=[%u,%u]\n", lower, upper); + if (ringHasMatch(xs, ring, ringSize, lower, upper)) { + return REPEAT_MATCH; + } + + return REPEAT_NOMATCH; +} + +enum RepeatMatch repeatHasMatchRange(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset) { + const struct RepeatRangeControl *xs = &ctrl->range; + const u16 *ring = (const u16 *)state; + + assert(xs->num > 0); + assert(xs->num <= rangeListCapacity(info)); + assert(rangeListIsOrdered(xs, ring)); + + // Walk the ring. For each entry x: + // if (offset - x) falls inside repeat bounds, return success. + + // It may be worth doing tests on first and last elements first to bail + // early if the whole ring is too young or stale. + + DEBUG_PRINTF("check %u (of %u) elements, offset %llu, bounds={%u,%u}\n", + xs->num, rangeListCapacity(info), offset, + info->repeatMin, info->repeatMax); +#ifdef DEBUG + dumpRange(info, xs, ring); +#endif + + // Quick pre-check for minimum. + assert(offset >= xs->offset); + if (offset - xs->offset < info->repeatMin) { + DEBUG_PRINTF("haven't even seen repeatMin bytes yet!\n"); + return REPEAT_NOMATCH; + } + + // We check the most recent offset first, as we can establish staleness. + u64a match = xs->offset + unaligned_load_u16(ring + xs->num - 1); + assert(offset >= match); + u64a diff = offset - match; + if (diff > info->repeatMax) { + DEBUG_PRINTF("range list is stale\n"); + return REPEAT_STALE; + } else if (diff >= info->repeatMin && diff <= info->repeatMax) { + return REPEAT_MATCH; + } + + // Check the other offsets in the list. 
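+    // (The newest entry, at index num - 1, was already tested above.)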
+ u32 count = xs->num - 1; + for (u32 i = 0; i < count; i++) { + match = xs->offset + unaligned_load_u16(ring + i); + assert(offset >= match); + diff = offset - match; + if (diff >= info->repeatMin && diff <= info->repeatMax) { + return REPEAT_MATCH; + } + } + + return REPEAT_NOMATCH; +} + +enum RepeatMatch repeatHasMatchBitmap(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + u64a offset) { + const struct RepeatBitmapControl *xs = &ctrl->bitmap; + + DEBUG_PRINTF("checking if offset=%llu is a match\n", offset); + +#ifdef DEBUG + dumpBitmap(xs); +#endif + + u64a bitmap = xs->bitmap; + if (!bitmap) { + DEBUG_PRINTF("no tops; stale\n"); + return REPEAT_STALE; + } + + // Quick pre-check for minimum. + const u64a base = xs->offset; + assert(offset >= base); + if (offset - base < info->repeatMin) { + DEBUG_PRINTF("haven't even seen repeatMin bytes yet!\n"); + return REPEAT_NOMATCH; + } + + // We check the most recent offset first, as we can establish staleness. + u64a match = base + findAndClearMSB_64(&bitmap); + DEBUG_PRINTF("offset=%llu, last_match %llu\n", offset, match); + assert(offset >= match); + u64a diff = offset - match; + if (diff > info->repeatMax) { + DEBUG_PRINTF("stale\n"); + return REPEAT_STALE; + } else if (diff >= info->repeatMin && diff <= info->repeatMax) { + return REPEAT_MATCH; + } + + while (bitmap) { + match = base + findAndClearLSB_64(&bitmap); + DEBUG_PRINTF("offset=%llu, last_match %llu\n", offset, match); + assert(offset >= match); + diff = offset - match; + if (diff >= info->repeatMin && diff <= info->repeatMax) { + return REPEAT_MATCH; + } + } + + return REPEAT_NOMATCH; +} + +enum RepeatMatch repeatHasMatchTrailer(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + u64a offset) { + const struct RepeatTrailerControl *xs = &ctrl->trailer; + const u32 m_width = info->repeatMax - info->repeatMin; + + DEBUG_PRINTF("offset=%llu, xs->offset=%llu, xs->bitmap=0x%llx\n", offset, + xs->offset, xs->bitmap); + + if (offset > xs->offset + m_width) { + DEBUG_PRINTF("stale\n"); + return REPEAT_STALE; + } + + if (offset >= xs->offset) { + DEBUG_PRINTF("in match window\n"); + return REPEAT_MATCH; + } + + if (offset >= xs->offset - info->repeatMin) { + u32 idx = xs->offset - offset - 1; + DEBUG_PRINTF("check bitmap idx %u\n", idx); + assert(idx < 64); + if (xs->bitmap & (1ULL << idx)) { + DEBUG_PRINTF("match in bitmap\n"); + return REPEAT_MATCH; + } + } + + DEBUG_PRINTF("no match\n"); + return REPEAT_NOMATCH; +} + +/** \brief True if the given value can be packed into len bytes. */ +static really_inline +int fits_in_len_bytes(u64a val, u32 len) { + if (len >= 8) { + return 1; + } + return val <= (1ULL << (len * 8)); +} + +static really_inline +void storePackedRelative(char *dest, u64a val, u64a offset, u64a max, u32 len) { + assert(val <= offset); + assert(fits_in_len_bytes(max, len)); + u64a delta = offset - val; + if (delta >= max) { + delta = max; + } + DEBUG_PRINTF("delta %llu\n", delta); + assert(fits_in_len_bytes(delta, len)); + partial_store_u64a(dest, delta, len); +} + +static +void repeatPackRing(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatRingControl *xs = &ctrl->ring; + const u32 ring_indices_len = info->repeatMax < 254 ? 2 : 4; + const u32 offset_len = info->packedCtrlSize - ring_indices_len; + + // Write out packed relative base offset. 
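+    // The base is stored as a delta back from the current stream offset,
+    // capped at info->horizon, so that it fits in offset_len bytes.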
+ assert(info->packedCtrlSize > ring_indices_len); + storePackedRelative(dest, xs->offset, offset, info->horizon, offset_len); + + // Write out ring indices. + if (ring_indices_len == 4) { + unaligned_store_u16(dest + offset_len, xs->first); + unaligned_store_u16(dest + offset_len + 2, xs->last); + } else { + assert(xs->first < 256 && xs->last < 256); + u8 *indices = (u8 *)dest + offset_len; + indices[0] = xs->first; + indices[1] = xs->last; + } +} + +static +void repeatPackOffset(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatOffsetControl *xs = &ctrl->offset; + DEBUG_PRINTF("packing offset %llu [h %u]\n", xs->offset, info->horizon); + if (!info->packedCtrlSize) { + assert(info->type == REPEAT_ALWAYS); + DEBUG_PRINTF("externally guarded .*\n"); + return; + } + storePackedRelative(dest, xs->offset, offset, info->horizon, + info->packedCtrlSize); +} + +static +void repeatPackRange(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatRangeControl *xs = &ctrl->range; + + // Write out packed relative base offset. + assert(info->packedCtrlSize > 1); + storePackedRelative(dest, xs->offset, offset, info->horizon, + info->packedCtrlSize - 1); + + // Write out range number of elements. + dest[info->packedCtrlSize - 1] = xs->num; +} + +static +void repeatPackBitmap(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatBitmapControl *xs = &ctrl->bitmap; + const u32 bound = info->repeatMax; + + assert(offset >= xs->offset); + u64a new_base = offset > bound ? offset - bound : 0; + + // Shift bitmap to begin at new_base rather than xs->offset. + u64a bitmap = xs->bitmap; + if (new_base >= xs->offset) { + u64a shift = new_base - xs->offset; + bitmap = shift < 64 ? bitmap >> shift : 0; + } else { + u64a shift = xs->offset - new_base; + bitmap = shift < 64 ? bitmap << shift : 0; + } + + DEBUG_PRINTF("packing %llu into %u bytes\n", bitmap, info->packedCtrlSize); + + // Write out packed bitmap. + assert(fits_in_len_bytes(bitmap, info->packedCtrlSize)); + partial_store_u64a(dest, bitmap, info->packedCtrlSize); +} + +static +void repeatPackSparseOptimalP(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatRingControl *xs = &ctrl->ring; + // set ring index pointer according to patch count + const u32 ring_indices_len = info->patchCount < 254 ? 2 : 4; + const u32 offset_len = info->packedCtrlSize - ring_indices_len; + + // Write out packed relative base offset. + assert(info->packedCtrlSize > ring_indices_len); + storePackedRelative(dest, xs->offset, offset, info->horizon, offset_len); + + // Write out ring indices. + if (ring_indices_len == 4) { + unaligned_store_u16(dest + offset_len, xs->first); + unaligned_store_u16(dest + offset_len + 2, xs->last); + } else { + assert(xs->first < 256 && xs->last < 256); + u8 *indices = (u8 *)dest + offset_len; + indices[0] = xs->first; + indices[1] = xs->last; + } + +} + +static +void repeatPackTrailer(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatTrailerControl *xs = &ctrl->trailer; + + DEBUG_PRINTF("saving: offset=%llu, xs->offset=%llu, xs->bitmap=0x%llx\n", + offset, xs->offset, xs->bitmap); + + // XXX: xs->offset may be zero in the NFA path (effectively uninitialized). 
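+    // In that case the code below falls back to treating the last top as
+    // being at stream start (top == 0).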
+ u64a top; + if (xs->offset) { + assert(xs->offset >= info->repeatMin); + top = xs->offset - info->repeatMin; + } else { + top = 0; + } + + top = offset - top; // Pack top relative to offset. + + u64a v[2]; + v[0] = MIN(top, info->horizon); + v[1] = xs->bitmap; + + pack_bits_64(dest, v, info->packedFieldSizes, 2); +} + +void repeatPack(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + assert(dest && info && ctrl); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + repeatPackRing(dest, info, ctrl, offset); + break; + case REPEAT_FIRST: + case REPEAT_LAST: + repeatPackOffset(dest, info, ctrl, offset); + break; + case REPEAT_RANGE: + repeatPackRange(dest, info, ctrl, offset); + break; + case REPEAT_BITMAP: + repeatPackBitmap(dest, info, ctrl, offset); + break; + case REPEAT_SPARSE_OPTIMAL_P: + repeatPackSparseOptimalP(dest, info, ctrl, offset); + break; + case REPEAT_TRAILER: + repeatPackTrailer(dest, info, ctrl, offset); + break; + case REPEAT_ALWAYS: + /* nothing to do - no state */ + break; + } +} + +static really_inline +u64a loadPackedRelative(const char *src, u64a offset, u32 len) { + u64a delta = partial_load_u64a(src, len); + DEBUG_PRINTF("delta %llu\n", delta); + assert(offset >= delta); + return offset - delta; +} + +static +void repeatUnpackRing(const char *src, const struct RepeatInfo *info, + u64a offset, union RepeatControl *ctrl) { + struct RepeatRingControl *xs = &ctrl->ring; + const u32 ring_indices_len = info->repeatMax < 254 ? 2 : 4; + const u32 offset_len = info->packedCtrlSize - ring_indices_len; + xs->offset = loadPackedRelative(src, offset, offset_len); + if (ring_indices_len == 4) { + xs->first = unaligned_load_u16(src + offset_len); + xs->last = unaligned_load_u16(src + offset_len + 2); + } else { + const u8 *indices = (const u8 *)src + offset_len; + xs->first = indices[0]; + xs->last = indices[1]; + } +} + +static +void repeatUnpackOffset(const char *src, const struct RepeatInfo *info, + u64a offset, union RepeatControl *ctrl) { + struct RepeatOffsetControl *xs = &ctrl->offset; + if (!info->packedCtrlSize) { + assert(info->type == REPEAT_ALWAYS); + DEBUG_PRINTF("externally guarded .*\n"); + xs->offset = 0; + } else { + xs->offset = loadPackedRelative(src, offset, info->packedCtrlSize); + } + DEBUG_PRINTF("unpacking offset %llu [h%u]\n", xs->offset, + info->horizon); +} + +static +void repeatUnpackRange(const char *src, const struct RepeatInfo *info, + u64a offset, union RepeatControl *ctrl) { + struct RepeatRangeControl *xs = &ctrl->range; + xs->offset = loadPackedRelative(src, offset, info->packedCtrlSize - 1); + xs->num = src[info->packedCtrlSize - 1]; +} + +static +void repeatUnpackBitmap(const char *src, const struct RepeatInfo *info, + u64a offset, union RepeatControl *ctrl) { + struct RepeatBitmapControl *xs = &ctrl->bitmap; + xs->offset = offset > info->repeatMax ? offset - info->repeatMax : 0; + xs->bitmap = partial_load_u64a(src, info->packedCtrlSize); +} + +static +void repeatUnpackSparseOptimalP(const char *src, const struct RepeatInfo *info, + u64a offset, union RepeatControl *ctrl) { + struct RepeatRingControl *xs = &ctrl->ring; + const u32 ring_indices_len = info->patchCount < 254 ? 
2 : 4; + const u32 offset_len = info->packedCtrlSize - ring_indices_len; + xs->offset = loadPackedRelative(src, offset, offset_len); + if (ring_indices_len == 4) { + xs->first = unaligned_load_u16(src + offset_len); + xs->last = unaligned_load_u16(src + offset_len + 2); + } else { + const u8 *indices = (const u8 *)src + offset_len; + xs->first = indices[0]; + xs->last = indices[1]; + } +} + +static +void repeatUnpackTrailer(const char *src, const struct RepeatInfo *info, + u64a offset, union RepeatControl *ctrl) { + struct RepeatTrailerControl *xs = &ctrl->trailer; + + u64a v[2]; + unpack_bits_64(v, (const u8 *)src, info->packedFieldSizes, 2); + + xs->offset = offset - v[0] + info->repeatMin; + xs->bitmap = v[1]; + + DEBUG_PRINTF("loaded: xs->offset=%llu, xs->bitmap=0x%llx\n", xs->offset, + xs->bitmap); +} + +void repeatUnpack(const char *src, const struct RepeatInfo *info, u64a offset, + union RepeatControl *ctrl) { + assert(src && info && ctrl); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + repeatUnpackRing(src, info, offset, ctrl); + break; + case REPEAT_FIRST: + case REPEAT_LAST: + repeatUnpackOffset(src, info, offset, ctrl); + break; + case REPEAT_RANGE: + repeatUnpackRange(src, info, offset, ctrl); + break; + case REPEAT_BITMAP: + repeatUnpackBitmap(src, info, offset, ctrl); + break; + case REPEAT_SPARSE_OPTIMAL_P: + repeatUnpackSparseOptimalP(src, info, offset, ctrl); + break; + case REPEAT_TRAILER: + repeatUnpackTrailer(src, info, offset, ctrl); + break; + case REPEAT_ALWAYS: + /* nothing to do - no state */ + break; + } +} + +static really_inline +const u64a *getImplTable(const struct RepeatInfo *info) { + const u64a *table = ((const u64a *)(ROUNDUP_PTR( + ((const char *)(info) + + sizeof(*info)), + alignof(u64a)))); + return table; +} + +static +void storeInitialRingTopPatch(const struct RepeatInfo *info, + struct RepeatRingControl *xs, + u8 *state, u64a offset) { + DEBUG_PRINTF("set the first patch, offset=%llu\n", offset); + xs->offset = offset; + + u8 *active = state; + u32 patch_count = info->patchCount; + mmbit_clear(active, patch_count); + mmbit_set(active, patch_count, 0); + + u8 *ring = active + info->patchesOffset; + u32 encoding_size = info->encodingSize; + partial_store_u64a(ring, 1ull, encoding_size); + xs->first = 0; + xs->last = 1; +} + +static +u32 getSparseOptimalTargetValue(const struct RepeatInfo *info, + const u32 tval, u64a *val) { + u32 patch_size = info->patchSize; + const u64a *repeatTable = getImplTable(info); + u32 loc = 0; + DEBUG_PRINTF("val:%llu \n", *val); + for (u32 i = 1; i <= patch_size - tval; i++) { + u64a tmp = repeatTable[patch_size - i]; + if (*val >= tmp) { + *val -= tmp; + loc = i; + i += (info->minPeriod - 1); + } + } + + return loc; +} + +static +u64a sparseLastTop(const struct RepeatInfo *info, + const struct RepeatRingControl *xs, const u8 *state) { + DEBUG_PRINTF("looking for last top\n"); + u32 patch_size = info->patchSize; + u32 patch_count = info->patchCount; + u32 encoding_size = info->encodingSize; + + u32 occ = ringOccupancy(xs, patch_count); + u32 patch = xs->first + occ - 1; + if (patch >= patch_count) { + patch -= patch_count; + } + + DEBUG_PRINTF("patch%u encoding_size%u occ%u\n", patch, encoding_size, occ); + const u8 *ring = state + info->patchesOffset; + u64a val = partial_load_u64a(ring + encoding_size * patch, encoding_size); + + DEBUG_PRINTF("val:%llu\n", val); + const u64a *repeatTable = getImplTable(info); + for (s32 i = patch_size - 1; i >= 0; i--) { + if (val >= repeatTable[i]) { + 
DEBUG_PRINTF("xs->offset%llu v%u p%llu\n", + xs->offset, i, repeatTable[i]); + return xs->offset + i + (occ - 1) * patch_size; + } + } + + assert(0); + return 0; +} + +u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state) { + return sparseLastTop(info, &ctrl->ring, state); +} + +u64a repeatNextMatchSparseOptimalP(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset) { + const struct RepeatRingControl *xs = &ctrl->ring; + + DEBUG_PRINTF("repeat [%u, %u] looking for match after %llu\n", + info->repeatMin, info->repeatMax, offset); + + assert(offset >= xs->offset); + + u64a nextOffset = offset + 1; + + u32 patch_size = info->patchSize; + u32 patch; + u32 tval; + if (nextOffset <= xs->offset + info->repeatMin) { + patch = xs->first; + tval = 0; + } else if (nextOffset > sparseLastTop(info, xs, state) + info->repeatMax) { + DEBUG_PRINTF("ring is stale\n"); + return 0; + } else { + assert(nextOffset - xs->offset < UINT32_MAX); // ring is not stale + u32 delta = (u32)(nextOffset - xs->offset); + u32 lower = delta > info->repeatMax ? delta - info->repeatMax : 0; + patch = lower / patch_size; + tval = lower - patch * patch_size; + } + + DEBUG_PRINTF("patch %u\n", patch); + u32 patch_count = info->patchCount; + if (patch >= patch_count) { + return 0; + } + + DEBUG_PRINTF("initial test for %u\n", tval); + + u32 begin = xs->first + patch; + if (begin >= patch_count) { + begin -= patch_count; + } + + const u8 *active = (const u8 *)state; + const u8 *ring = active + info->patchesOffset; + u32 encoding_size = info->encodingSize; + const u32 end = begin >= xs->last ? patch_count : xs->last; + u32 low = tval; + u64a diff = 0, loc = 0; + DEBUG_PRINTF("begin %u end %u\n", begin, end); + for (u32 p = mmbit_iterate_bounded(active, patch_count, begin, end); + p != MMB_INVALID; p = mmbit_iterate_bounded(active, patch_count, + p + 1, end)) { + if (p != begin) { + low = 0; + } + + u64a val = partial_load_u64a(ring + encoding_size * p, encoding_size); + u32 p1 = 0; + if (p >= xs->first) { + p1 = p - xs->first; + } else { + p1 = p + patch_count - xs->first; + } + + if (val) { + loc = getSparseOptimalTargetValue(info, low, &val); + diff = (p1 + 1) * patch_size - loc; + } + if (loc) { + u64a rv = MAX(nextOffset, xs->offset + info->repeatMin + diff); + DEBUG_PRINTF("offset%llu next match at %llu\n", xs->offset, rv); + return rv; + } + low = 0; + } + + low = 0; + if (begin >= xs->last) { + for (u32 p = mmbit_iterate_bounded(active, patch_count, 0, xs->last); + p != MMB_INVALID; p = mmbit_iterate_bounded(active, patch_count, + p + 1, xs->last)) { + + u64a val = partial_load_u64a(ring + encoding_size * p, + encoding_size); + if (val) { + loc = getSparseOptimalTargetValue(info, low, &val); + diff = (p + 1) * patch_size - loc; + } + if (loc) { + u64a rv = MAX(nextOffset, xs->offset + info->repeatMin + + diff + (end - xs->first) * patch_size); + DEBUG_PRINTF("next match at %llu\n", rv); + return rv; + } + } + } + + DEBUG_PRINTF("next match\n"); + return 0; +} + +void repeatStoreSparseOptimalP(const struct RepeatInfo *info, + union RepeatControl *ctrl, void *state, + u64a offset, char is_alive) { + struct RepeatRingControl *xs = &ctrl->ring; + u8 *active = (u8 *)state; + + DEBUG_PRINTF("offset: %llu encoding_size: %u\n", offset, + info->encodingSize); + + // If (a) this is the first top, or (b) the ring is stale, initialize the + // ring and write this offset in as the first top. 
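+    // The ring is stale when the new top lies more than repeatMax bytes
+    // past the last stored top, so no existing patch can match again.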
+ if (!is_alive || + offset > sparseLastTop(info, xs, state) + info->repeatMax) { + storeInitialRingTopPatch(info, xs, active, offset); + return; + } + + // Tops should arrive in order, with no duplicates. + assert(offset > sparseLastTop(info, xs, state)); + + // As the ring is not stale, our delta should fit within a u32. + assert(offset - xs->offset <= UINT32_MAX); + u32 delta = (u32)(offset - xs->offset); + u32 patch_size = info->patchSize; + u32 patch_count = info->patchCount; + u32 encoding_size = info->encodingSize; + u32 patch = delta / patch_size; + + DEBUG_PRINTF("delta=%u, patch_size=%u, patch=%u\n", delta, patch_size, + patch); + + u8 *ring = active + info->patchesOffset; + u32 occ = ringOccupancy(xs, patch_count); + u64a val = 0; + u32 idx; + + DEBUG_PRINTF("patch: %u patch_count: %u occ: %u\n", + patch, patch_count, occ); + if (patch >= patch_count) { + u32 patch_shift_count = patch - patch_count + 1; + assert(patch >= patch_shift_count); + DEBUG_PRINTF("shifting by %u\n", patch_shift_count); + xs->offset += patch_size * patch_shift_count; + xs->first += patch_shift_count; + if (xs->first >= patch_count) { + xs->first -= patch_count; + } + idx = xs->last + patch - occ; + mmbit_unset_range(active, patch_count, xs->last, + MIN(idx, patch_count)); + if (idx >= patch_count) { + idx -= patch_count; + mmbit_unset_range(active, patch_count, 0, idx + 1); + } + xs->last = idx + 1; + if (xs->last == patch_count) { + xs->last = 0; + } + } else if (patch < occ) { + assert(patch == occ - 1); + idx = xs->last == 0 ? patch_count - 1 : (u32)xs->last - 1; + val = partial_load_u64a(ring + encoding_size * idx, encoding_size); + } else { + idx = xs->last + patch - occ; + mmbit_unset_range(active, patch_count, xs->last, + MIN(idx, patch_count)); + if (idx >= patch_count) { + idx -= patch_count; + mmbit_unset_range(active, patch_count, 0, idx + 1); + } + xs->last = idx + 1; + if (xs->last == patch_count) { + xs->last = 0; + } + } + + assert((u64a)patch * patch_size <= delta); + u32 diff = delta - patch * patch_size; + const u64a *repeatTable = getImplTable(info); + val += repeatTable[diff]; + + DEBUG_PRINTF("patch=%u, occ=%u\n", patch, occ); + DEBUG_PRINTF("xs->first:%u xs->last:%u patch:%u\n", + xs->first, xs->last, patch); + DEBUG_PRINTF("value:%llu\n", val); + assert(fits_in_len_bytes(val, encoding_size)); + partial_store_u64a(ring + encoding_size * idx, val, encoding_size); + mmbit_set(active, patch_count, idx); +} + +static +char sparseHasMatch(const struct RepeatInfo *info, const u8 *state, + u32 lower, u32 upper) { + u32 patch_size = info->patchSize; + u32 patch_count = info->patchCount; + u32 encoding_size = info->encodingSize; + u32 patch_lower = lower / patch_size; + u32 patch_upper = upper / patch_size; + u32 diff = lower - patch_lower * patch_size; + + DEBUG_PRINTF("lower=%u, upper=%u\n", lower, upper); + const u64a *repeatTable = getImplTable(info); + + const u8 *ring = state + info->patchesOffset; + const u8 *active = state; + u64a val; + // test the first patch + if (mmbit_isset(active, patch_count, patch_lower)) { + val = partial_load_u64a(ring + encoding_size * patch_lower, + encoding_size); + DEBUG_PRINTF("patch_size=%u, diff=%u, table=%llu\n", + patch_size, diff, repeatTable[diff]); + DEBUG_PRINTF("patch_lower=%u, patch_upper=%u\n", + patch_lower, patch_upper); + if (patch_upper == patch_lower) { + u32 limit = upper - patch_lower * patch_size; + getSparseOptimalTargetValue(info, limit + 1, &val); + } + if (val >= repeatTable[diff]) { + return 1; + } + } + + if (patch_lower == 
patch_upper) { + return 0; + } + + // test the patches between first and last + u32 m = mmbit_iterate_bounded(active, patch_count, + patch_lower + 1, patch_upper); + if (m != MMB_INVALID) { + return 1; + } + + if (patch_upper == patch_count) { + return 0; + } + + // test the last patch + if (!mmbit_isset(active, patch_count, patch_upper)) { + return 0; + } + diff = (patch_upper + 1) * patch_size - upper; + DEBUG_PRINTF("diff=%u\n", diff); + val = partial_load_u64a(ring + encoding_size * patch_upper, encoding_size); + getSparseOptimalTargetValue(info, patch_size - diff + 1, &val); + if (val) { + DEBUG_PRINTF("last patch: val=%llu\n", val); + return 1; + } + + return 0; +} + +enum RepeatMatch repeatHasMatchSparseOptimalP(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset) { + DEBUG_PRINTF("check for match at %llu corresponding to trigger " + "at [%llu, %llu]\n", offset, offset - info->repeatMax, + offset - info->repeatMin); + + const struct RepeatRingControl *xs = &ctrl->ring; + const u8 *ring = (const u8 *)state; + + assert(offset >= xs->offset); + + if (offset < xs->offset + info->repeatMin) { + DEBUG_PRINTF("too soon\n"); + return REPEAT_NOMATCH; + } else if (offset > sparseLastTop(info, xs, state) + info->repeatMax) { + DEBUG_PRINTF("stale\n"); + return REPEAT_STALE; + } + + // Our delta between the base offset of the ring and the current offset + // must fit within the range [repeatMin, lastPossibleTop + repeatMax]. This + // range fits comfortably within a u32. + assert(offset - xs->offset <= UINT32_MAX); + + u32 delta = (u32)(offset - xs->offset); + u32 patch_size = info->patchSize; + u32 patch_count = info->patchCount; + u32 occ = ringOccupancy(xs, patch_count); + + u32 lower = delta > info->repeatMax ? 
delta - info->repeatMax : 0; + u32 upper = MIN(delta - info->repeatMin, occ * patch_size - 1); + + DEBUG_PRINTF("lower=%u, upper=%u\n", lower, upper); + u32 patch_lower = lower / patch_size; + u32 patch_upper = upper / patch_size; + + if (patch_lower >= occ) { + DEBUG_PRINTF("too late\n"); + return REPEAT_NOMATCH; + } + + u32 remaining_lower = lower - patch_lower * patch_size; + u32 remaining_upper = upper - patch_upper * patch_size; + patch_lower += xs->first; + patch_upper += xs->first; + if (patch_lower >= patch_count) { + patch_lower -= patch_count; + patch_upper -= patch_count; + } else if (patch_upper >= patch_count) { + patch_upper -= patch_count; + } + + DEBUG_PRINTF("xs->first:%u xs->last:%u patch_lower:%u, patch_upper:%u\n", + xs->first, xs->last, patch_lower, patch_upper); + + u32 scan_end; + const char is_not_wrapped = (patch_lower <= patch_upper); + if (is_not_wrapped) { + scan_end = patch_upper * patch_size + remaining_upper; + } else { + scan_end = patch_count * patch_size; + } + + lower = patch_lower * patch_size + remaining_lower; + if (sparseHasMatch(info, ring, lower, scan_end)) { + return REPEAT_MATCH; + } + + if (!is_not_wrapped) { + upper -= (patch_count - xs->first) * patch_size; + if (sparseHasMatch(info, ring, 0, upper)) { + return REPEAT_MATCH; + } + } + + return REPEAT_NOMATCH; +} diff --git a/regex/nfa/repeat.h b/regex/nfa/repeat.h new file mode 100644 index 000000000..d4f84ea0a --- /dev/null +++ b/regex/nfa/repeat.h @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief API for handling bounded repeats. + * + * This file provides an internal API for handling bounded repeats of character + * classes. It is used by the Large Bounded Repeat (LBR) engine and by the + * bounded repeat handling in the LimEx NFA engine as well. + * + * The state required by these functions is split into two regions: + * + * 1. Control block. This is a small structure (size varies with repeat mode) + * that may be copied around or compressed into stream state. 
+ * 2. Repeat state. This is a larger structure that can be quite big for large + * repeats, often containing a multibit ring or large vector of indices. + * This generally lives in stream state and is not copied. + */ + +#ifndef REPEAT_H +#define REPEAT_H + +#include "ue2common.h" +#include "repeat_internal.h" +#include "util/bitutils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** Returns the offset of the most recent 'top' offset set in the repeat. */ +static really_inline +u64a repeatLastTop(const struct RepeatInfo *info, + const union RepeatControl *ctrl, const void *state); + +/** Returns the offset of the next match after 'offset', or zero if no further + * matches are possible. */ +static really_inline +u64a repeatNextMatch(const struct RepeatInfo *info, + const union RepeatControl *ctrl, const void *state, + u64a offset); + +/** Stores a new top in the repeat. If is_alive is false, the repeat will be + * initialised first and this top will become the first (and only) one. */ +static really_inline +void repeatStore(const struct RepeatInfo *info, union RepeatControl *ctrl, + void *state, u64a offset, char is_alive); + +/** Return type for repeatHasMatch. */ +enum RepeatMatch { + REPEAT_NOMATCH, /**< This offset is not a valid match. */ + REPEAT_MATCH, /**< This offset is a valid match. */ + REPEAT_STALE /**< This offset is not a valid match and no greater + offset will be (unless another top is stored). */ +}; + +/** Query whether the repeat has a match at the given offset. Returns + * ::REPEAT_STALE if it does not have a match at that offset _and_ + * no further matches are possible. */ +static really_inline +enum RepeatMatch repeatHasMatch(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset); + +/** \brief Serialize a packed version of the repeat control block into stream + * state. */ +void repeatPack(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset); + +/** \brief Deserialize a packed version of the repeat control block. */ +void repeatUnpack(const char *src, const struct RepeatInfo *info, u64a offset, + union RepeatControl *ctrl); + +//// +//// IMPLEMENTATION. +//// + +u64a repeatLastTopRing(const struct RepeatInfo *info, + const union RepeatControl *ctrl); + +u64a repeatLastTopRange(const union RepeatControl *ctrl, + const void *state); + +u64a repeatLastTopBitmap(const union RepeatControl *ctrl); + +u64a repeatLastTopTrailer(const struct RepeatInfo *info, + const union RepeatControl *ctrl); + +u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state); + +static really_inline +u64a repeatLastTop(const struct RepeatInfo *info, + const union RepeatControl *ctrl, const void *state) { + assert(info && ctrl && state); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + return repeatLastTopRing(info, ctrl); + case REPEAT_FIRST: + case REPEAT_LAST: + return ctrl->offset.offset; + case REPEAT_RANGE: + return repeatLastTopRange(ctrl, state); + case REPEAT_BITMAP: + return repeatLastTopBitmap(ctrl); + case REPEAT_SPARSE_OPTIMAL_P: + return repeatLastTopSparseOptimalP(info, ctrl, state); + case REPEAT_TRAILER: + return repeatLastTopTrailer(info, ctrl); + case REPEAT_ALWAYS: + return 0; + } + + DEBUG_PRINTF("bad repeat type %u\n", info->type); + assert(0); + return 0; +} + +// Used for both FIRST and LAST models. 
+static really_inline +u64a repeatNextMatchOffset(const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + u64a first = ctrl->offset.offset + info->repeatMin; + if (offset < first) { + return first; + } + + if (info->repeatMax == REPEAT_INF || + offset < ctrl->offset.offset + info->repeatMax) { + return offset + 1; + } + + return 0; // No more matches. +} + +u64a repeatNextMatchRing(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset); + +u64a repeatNextMatchRange(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset); + +u64a repeatNextMatchBitmap(const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset); + +u64a repeatNextMatchSparseOptimalP(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset); + +u64a repeatNextMatchTrailer(const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset); + +static really_inline +u64a repeatNextMatch(const struct RepeatInfo *info, + const union RepeatControl *ctrl, const void *state, + u64a offset) { + assert(info && ctrl && state); + assert(ISALIGNED(info)); + assert(ISALIGNED(ctrl)); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + return repeatNextMatchRing(info, ctrl, state, offset); + case REPEAT_FIRST: + // fall through + case REPEAT_LAST: + return repeatNextMatchOffset(info, ctrl, offset); + case REPEAT_RANGE: + return repeatNextMatchRange(info, ctrl, state, offset); + case REPEAT_BITMAP: + return repeatNextMatchBitmap(info, ctrl, offset); + case REPEAT_SPARSE_OPTIMAL_P: + return repeatNextMatchSparseOptimalP(info, ctrl, state, offset); + case REPEAT_TRAILER: + return repeatNextMatchTrailer(info, ctrl, offset); + case REPEAT_ALWAYS: + return offset + 1; + } + + DEBUG_PRINTF("bad repeat type %u\n", info->type); + assert(0); + return 0; +} + +static really_inline +void repeatStoreFirst(union RepeatControl *ctrl, u64a offset, + char is_alive) { + if (is_alive) { + return; + } + ctrl->offset.offset = offset; +} + +static really_inline +void repeatStoreLast(union RepeatControl *ctrl, u64a offset, + UNUSED char is_alive) { + assert(!is_alive || offset >= ctrl->offset.offset); + ctrl->offset.offset = offset; +} + +void repeatStoreRing(const struct RepeatInfo *info, + union RepeatControl *ctrl, void *state, u64a offset, + char is_alive); + +void repeatStoreRange(const struct RepeatInfo *info, + union RepeatControl *ctrl, void *state, u64a offset, + char is_alive); + +void repeatStoreBitmap(const struct RepeatInfo *info, + union RepeatControl *ctrl, u64a offset, + char is_alive); + +void repeatStoreSparseOptimalP(const struct RepeatInfo *info, + union RepeatControl *ctrl, void *state, + u64a offset, char is_alive); + +void repeatStoreTrailer(const struct RepeatInfo *info, + union RepeatControl *ctrl, u64a offset, + char is_alive); + +static really_inline +void repeatStore(const struct RepeatInfo *info, union RepeatControl *ctrl, + void *state, u64a offset, char is_alive) { + assert(info && ctrl && state); + assert(ISALIGNED(info)); + assert(ISALIGNED(ctrl)); + + assert(info->repeatMin <= info->repeatMax); + assert(info->repeatMax <= REPEAT_INF); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + repeatStoreRing(info, ctrl, state, offset, is_alive); + break; + case REPEAT_FIRST: + repeatStoreFirst(ctrl, offset, is_alive); + break; + case REPEAT_LAST: + repeatStoreLast(ctrl, offset, is_alive); + break; + case REPEAT_RANGE: + 
repeatStoreRange(info, ctrl, state, offset, is_alive); + break; + case REPEAT_BITMAP: + repeatStoreBitmap(info, ctrl, offset, is_alive); + break; + case REPEAT_SPARSE_OPTIMAL_P: + repeatStoreSparseOptimalP(info, ctrl, state, offset, is_alive); + break; + case REPEAT_TRAILER: + repeatStoreTrailer(info, ctrl, offset, is_alive); + break; + case REPEAT_ALWAYS: + /* nothing to do - no state */ + break; + } +} + +static really_inline +enum RepeatMatch repeatHasMatchFirst(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + u64a offset) { + if (offset < ctrl->offset.offset + info->repeatMin) { + return REPEAT_NOMATCH; + } + + // FIRST models are {N,} repeats, i.e. they always have inf max depth. + assert(info->repeatMax == REPEAT_INF); + return REPEAT_MATCH; +} + +static really_inline +enum RepeatMatch repeatHasMatchLast(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + u64a offset) { + if (offset < ctrl->offset.offset + info->repeatMin) { + return REPEAT_NOMATCH; + } + assert(info->repeatMax < REPEAT_INF); + if (offset <= ctrl->offset.offset + info->repeatMax) { + return REPEAT_MATCH; + } + return REPEAT_STALE; +} + +enum RepeatMatch repeatHasMatchRing(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset); + +enum RepeatMatch repeatHasMatchRange(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset); + +enum RepeatMatch repeatHasMatchSparseOptimalP(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset); + +enum RepeatMatch repeatHasMatchBitmap(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + u64a offset); + +enum RepeatMatch repeatHasMatchTrailer(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + u64a offset); + +static really_inline +enum RepeatMatch repeatHasMatch(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset) { + assert(info && ctrl && state); + assert(ISALIGNED(info)); + assert(ISALIGNED(ctrl)); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + return repeatHasMatchRing(info, ctrl, state, offset); + case REPEAT_FIRST: + return repeatHasMatchFirst(info, ctrl, offset); + case REPEAT_LAST: + return repeatHasMatchLast(info, ctrl, offset); + case REPEAT_RANGE: + return repeatHasMatchRange(info, ctrl, state, offset); + case REPEAT_BITMAP: + return repeatHasMatchBitmap(info, ctrl, offset); + case REPEAT_SPARSE_OPTIMAL_P: + return repeatHasMatchSparseOptimalP(info, ctrl, state, offset); + case REPEAT_TRAILER: + return repeatHasMatchTrailer(info, ctrl, offset); + case REPEAT_ALWAYS: + return REPEAT_MATCH; + } + + assert(0); + return REPEAT_NOMATCH; +} + +#ifdef __cplusplus +} +#endif + +#endif // REPEAT_H diff --git a/regex/nfa/repeat_internal.h b/regex/nfa/repeat_internal.h new file mode 100644 index 000000000..9e3f455c8 --- /dev/null +++ b/regex/nfa/repeat_internal.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef REPEAT_INTERNAL_H +#define REPEAT_INTERNAL_H + +#include "ue2common.h" + +/** \file + * \brief Bounded Repeat models. + * + * Used by the NFA, to represent bounded repeats managed via special POS and + * TUG exceptions, and by the LBR (limited bounded repeat) and Castle + * specialist engines. + * + * We currently have a number of different kinds of bounded repeat model, for + * different kinds of {N,M} repeats, described by ::RepeatType. + */ + +/** Different types of bounded repeats. */ +enum RepeatType { + /** General mechanism for tracking {N,M} repeats. Stores the first top as + * an absolute offset, then subsequent tops in the {N,M} range as a ring of + * relative top indices stored in a multibit. */ + REPEAT_RING, + + /** Used to track {N,} repeats. Uses the \ref RepeatOffsetControl structure, + * since only the first top encountered needs to be stored. */ + REPEAT_FIRST, + + /** Used to track {0,N} repeats. Much like ::REPEAT_FIRST, except that we + * store the most recent top encountered. */ + REPEAT_LAST, + + /** Like ::REPEAT_RING, this is also used for {N,M} repeats, but for cases + * where there is a large difference between N and M, and developed to + * reduce the state requirements of this case (relative to the RING model). + * Uses a small ordered array of top indices relative to \ref + * RepeatRangeControl::offset. */ + REPEAT_RANGE, + + /** Used for {N,M} repeats where 0 < M <= 64. Uses the \ref + * RepeatBitmapControl structure at runtime. */ + REPEAT_BITMAP, + + /** Optimal mechanism for tracking {N,M} repeats when there is a bound on + * how frequently they can be retriggered. + * Assume f(repeat, min) representing the number of possible bit patterns + * we can have for repeat size = repeat, minimum period = min + * We will have the following recurrence relation: + * f(repeat, min) = f(repeat - 1, min) + f(repeat - min, min); + * We use this recurrence to encode bit patterns with 64-bit values by + * referencing a table that stores values from f(0, min) to f(repeat, min) + * eg: repeat = 5, min = 2. 10001 => f(4,2) + f(0,2) = 9. + * We search the optimal patch size between min and repeat in advance and + * use the scheme above to do encoding and decoding to reduce stream state + * size. */ + REPEAT_SPARSE_OPTIMAL_P, + + /** Used for {N,M} repeats where 0 < N < 64. Uses the + * \ref RepeatTrailerControl structure at runtime. */ + REPEAT_TRAILER, + + /** Degenerate repeat that always returns true. Used by castle for pseudo + * [^X]* repeats. 
*/ + REPEAT_ALWAYS, +}; + +/** + * \brief Value used to represent an unbounded max repeat. + * + * Note that we do not support \ref RepeatInfo::repeatMax values larger than + * this. + */ +#define REPEAT_INF 65535 + +/** Max slots used by ::REPEAT_RANGE repeat model. */ +#define REPEAT_RANGE_MAX_SLOTS 16 + +/** Structure describing a bounded repeat in the bytecode */ +struct RepeatInfo { + u8 type; //!< from enum RepeatType. + u32 repeatMin; //!< minimum number of repeats. + u32 repeatMax; //!< maximum number of repeats, or REPEAT_INF if unbounded. + + /** Maximum value that is required to be stored in the control block + * counters. Any value greater than this will be capped at the horizon. + */ + u32 horizon; + + /** Size of the compressed control block in bytes. This is what is written + * out to stream state at stream boundaries. */ + u32 packedCtrlSize; + + /** Size of the repeat state block in bytes. This is where the REPEAT_RANGE + * vector and REPEAT_RING multibit are stored, in stream state, and they + * are manipulated directly (i.e. not copied at stream boundaries). */ + u32 stateSize; + + /** How soon after one trigger we can see the next trigger. + * Used by REPEAT_SPARSE_OPTIMAL_P. */ + u32 minPeriod; + + /** Packed control block field sizes (in bits), used by REPEAT_TRAILER. */ + u32 packedFieldSizes[2]; + + /* Number of patches, used by REPEAT_SPARSE_OPTIMAL_P. */ + u32 patchCount; + + /* Optimal patch length, used by REPEAT_SPARSE_OPTIMAL_P. */ + u32 patchSize; + + /* Encoding patch length in bytes, used by REPEAT_SPARSE_OPTIMAL_P. */ + u32 encodingSize; + + /* RepeatInfo struct length including table size. */ + u32 length; + + /** Offset of patches relative to the start of repeat stream state, + * used by REPEAT_SPARSE_OPTIMAL_P. */ + u32 patchesOffset; +}; + +/** Runtime control block structure for ::REPEAT_RING and + * ::REPEAT_SPARSE_OPTIMAL_P bounded repeats. Note that this struct is packed + * (may not be aligned). */ +struct RepeatRingControl { + u64a offset; //!< index of first top. + u16 first; //!< start index in ring. + u16 last; //!< end index in ring. +}; + +/** Runtime control block structure for ::REPEAT_RANGE bounded repeats. Note + * that this struct is packed (may not be aligned). */ +struct RepeatRangeControl { + u64a offset; //!< index of first top. + u8 num; //!< number of elements in array. +}; + +/** Runtime control block structure for cases where only a single offset is + * needed to track the repeat, both ::REPEAT_FIRST and ::REPEAT_LAST. Note that + * this struct is packed (may not be aligned). */ +struct RepeatOffsetControl { + u64a offset; //!< index of a top. +}; + +/** Runtime control block structure for ::REPEAT_BITMAP bounded repeats. */ +struct RepeatBitmapControl { + u64a offset; //!< index of first top. + u64a bitmap; //!< forward bitmap of tops relative to base offset. +}; + +/** Runtime control block structure for ::REPEAT_TRAILER bounded repeats. */ +struct RepeatTrailerControl { + u64a offset; //!< min extent of most recent match window. + u64a bitmap; //!< trailing bitmap of earlier matches, relative to offset. +}; + +/** \brief Union of control block types, used at runtime. */ +union RepeatControl { + struct RepeatRingControl ring; + struct RepeatRangeControl range; + struct RepeatOffsetControl offset; + struct RepeatBitmapControl bitmap; + struct RepeatTrailerControl trailer; +}; + +/** For debugging, returns the name of a repeat model. 
*/ +static really_inline UNUSED +const char *repeatTypeName(u8 type) { + switch ((enum RepeatType)type) { + case REPEAT_RING: + return "RING"; + case REPEAT_FIRST: + return "FIRST"; + case REPEAT_LAST: + return "LAST"; + case REPEAT_RANGE: + return "RANGE"; + case REPEAT_BITMAP: + return "BITMAP"; + case REPEAT_SPARSE_OPTIMAL_P: + return "SPARSE_OPTIMAL_P"; + case REPEAT_TRAILER: + return "TRAILER"; + case REPEAT_ALWAYS: + return "ALWAYS"; + } + assert(0); + return "UNKNOWN"; +} + +#endif // REPEAT_INTERNAL_H diff --git a/regex/nfa/sheng.c b/regex/nfa/sheng.c new file mode 100644 index 000000000..3f36e2189 --- /dev/null +++ b/regex/nfa/sheng.c @@ -0,0 +1,1877 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "sheng.h" + +#include "accel.h" +#include "sheng_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "util/bitutils.h" +#include "util/compare.h" +#include "util/join.h" +#include "util/simd_utils.h" + +enum MatchMode { + CALLBACK_OUTPUT, + STOP_AT_MATCH, + NO_MATCHES +}; + +static really_inline +const struct sheng *get_sheng(const struct NFA *n) { + return (const struct sheng *)getImplNfa(n); +} + +static really_inline +const struct sstate_aux *get_aux(const struct sheng *sh, u8 id) { + u32 offset = sh->aux_offset - sizeof(struct NFA) + + (id & SHENG_STATE_MASK) * sizeof(struct sstate_aux); + DEBUG_PRINTF("Getting aux for state %u at offset %llu\n", + id & SHENG_STATE_MASK, (u64a)offset + sizeof(struct NFA)); + return (const struct sstate_aux *)((const char *) sh + offset); +} + +static really_inline +const union AccelAux *get_accel(const struct sheng *sh, u8 id) { + const struct sstate_aux *saux = get_aux(sh, id); + DEBUG_PRINTF("Getting accel aux at offset %u\n", saux->accel); + const union AccelAux *aux = (const union AccelAux *) + ((const char *)sh + saux->accel - sizeof(struct NFA)); + return aux; +} + +static really_inline +const struct report_list *get_rl(const struct sheng *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept - sizeof(struct NFA)); +} + +static really_inline +const struct report_list *get_eod_rl(const struct sheng *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept_eod - sizeof(struct NFA)); +} + +static really_inline +char shengHasAccept(const struct sheng *sh, const struct sstate_aux *aux, + ReportID report) { + assert(sh && aux); + + const struct report_list *rl = get_rl(sh, aux); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + return 1; + } + } + + return 0; +} + +static really_inline +char fireSingleReport(NfaCallback cb, void *ctxt, ReportID r, u64a loc) { + DEBUG_PRINTF("reporting %u\n", r); + if (cb(0, loc, r, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +static really_inline +char fireReports(const struct sheng *sh, NfaCallback cb, void *ctxt, + const u8 state, u64a loc, u8 *const cached_accept_state, + ReportID *const cached_accept_id, char eod) { + DEBUG_PRINTF("reporting matches @ %llu\n", loc); + + if (!eod && state == *cached_accept_state) { + DEBUG_PRINTF("reporting %u\n", *cached_accept_id); + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + const struct sstate_aux *aux = get_aux(sh, state); + const struct report_list *rl = eod ? 
get_eod_rl(sh, aux) : get_rl(sh, aux); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = state; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +#if defined(HAVE_AVX512VBMI) +// Sheng32 +static really_inline +const struct sheng32 *get_sheng32(const struct NFA *n) { + return (const struct sheng32 *)getImplNfa(n); +} + +static really_inline +const struct sstate_aux *get_aux32(const struct sheng32 *sh, u8 id) { + u32 offset = sh->aux_offset - sizeof(struct NFA) + + (id & SHENG32_STATE_MASK) * sizeof(struct sstate_aux); + DEBUG_PRINTF("Getting aux for state %u at offset %llu\n", + id & SHENG32_STATE_MASK, (u64a)offset + sizeof(struct NFA)); + return (const struct sstate_aux *)((const char *) sh + offset); +} + +static really_inline +const union AccelAux *get_accel32(const struct sheng32 *sh, u8 id) { + const struct sstate_aux *saux = get_aux32(sh, id); + DEBUG_PRINTF("Getting accel aux at offset %u\n", saux->accel); + const union AccelAux *aux = (const union AccelAux *) + ((const char *)sh + saux->accel - sizeof(struct NFA)); + return aux; +} + +static really_inline +const struct report_list *get_rl32(const struct sheng32 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept - sizeof(struct NFA)); +} + +static really_inline +const struct report_list *get_eod_rl32(const struct sheng32 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept_eod - sizeof(struct NFA)); +} + +static really_inline +char sheng32HasAccept(const struct sheng32 *sh, const struct sstate_aux *aux, + ReportID report) { + assert(sh && aux); + + const struct report_list *rl = get_rl32(sh, aux); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + return 1; + } + } + + return 0; +} + +static really_inline +char fireReports32(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + const u8 state, u64a loc, u8 *const cached_accept_state, + ReportID *const cached_accept_id, char eod) { + DEBUG_PRINTF("reporting matches @ %llu\n", loc); + + if (!eod && state == *cached_accept_state) { + DEBUG_PRINTF("reporting %u\n", *cached_accept_id); + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + const struct sstate_aux *aux = get_aux32(sh, state); + const struct report_list *rl = eod ? 
get_eod_rl32(sh, aux) : + get_rl32(sh, aux); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = state; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +// Sheng64 +static really_inline +const struct sheng64 *get_sheng64(const struct NFA *n) { + return (const struct sheng64 *)getImplNfa(n); +} + +static really_inline +const struct sstate_aux *get_aux64(const struct sheng64 *sh, u8 id) { + u32 offset = sh->aux_offset - sizeof(struct NFA) + + (id & SHENG64_STATE_MASK) * sizeof(struct sstate_aux); + DEBUG_PRINTF("Getting aux for state %u at offset %llu\n", + id & SHENG64_STATE_MASK, (u64a)offset + sizeof(struct NFA)); + return (const struct sstate_aux *)((const char *) sh + offset); +} + +static really_inline +const struct report_list *get_rl64(const struct sheng64 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept - sizeof(struct NFA)); +} + +static really_inline +const struct report_list *get_eod_rl64(const struct sheng64 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept_eod - sizeof(struct NFA)); +} + +static really_inline +char sheng64HasAccept(const struct sheng64 *sh, const struct sstate_aux *aux, + ReportID report) { + assert(sh && aux); + + const struct report_list *rl = get_rl64(sh, aux); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + return 1; + } + } + + return 0; +} + +static really_inline +char fireReports64(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + const u8 state, u64a loc, u8 *const cached_accept_state, + ReportID *const cached_accept_id, char eod) { + DEBUG_PRINTF("reporting matches @ %llu\n", loc); + + if (!eod && state == *cached_accept_state) { + DEBUG_PRINTF("reporting %u\n", *cached_accept_id); + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + const struct sstate_aux *aux = get_aux64(sh, state); + const struct report_list *rl = eod ? 
get_eod_rl64(sh, aux) : + get_rl64(sh, aux); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = state; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} +#endif // end of HAVE_AVX512VBMI + +/* include Sheng function definitions */ +#include "sheng_defs.h" + +static really_inline +char runShengCb(const struct sheng *sh, NfaCallback cb, void *ctxt, u64a offset, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + const u8 *cur_buf, const u8 *start, const u8 *end, u8 can_die, + u8 has_accel, u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan and report all matches */ + if (can_die) { + if (has_accel) { + rv = sheng4_coda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, end, + scanned); + } else { + if (has_accel) { + rv = sheng4_coa(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, end, + scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + return MO_ALIVE; +} + +static really_inline +void runShengNm(const struct sheng *sh, NfaCallback cb, void *ctxt, u64a offset, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + const u8 *cur_buf, const u8 *start, const u8 *end, u8 can_die, + u8 has_accel, u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + /* just scan the buffer */ + if (can_die) { + if (has_accel) { + sheng4_nmda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, end, + scanned); + } else { + sheng4_nmd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, end, + scanned); + } + sheng_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, 
offset, cur_buf, *scanned, end, scanned); + } else { + sheng4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, start, end, scanned); + sheng_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } +} + +static really_inline +char runShengSam(const struct sheng *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, u8 has_accel, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan until first match */ + if (can_die) { + if (has_accel) { + rv = sheng4_samda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, + end, scanned); + } else { + if (has_accel) { + rv = sheng4_sama(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, end, + scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + return MO_ALIVE; +} + +static never_inline +char runSheng(const struct sheng *sh, struct mq *q, s64a b_end, + enum MatchMode mode) { + u8 state = *(u8 *)q->state; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + DEBUG_PRINTF("starting Sheng execution in state %u\n", + state & SHENG_STATE_MASK); + + if (q->report_current) { + DEBUG_PRINTF("reporting current pending matches\n"); + assert(sh); + + q->report_current = 0; + + int rv; + if (single) { + rv = fireSingleReport(q->cb, q->context, sh->report, + q_cur_offset(q)); + } else { + rv = fireReports(sh, q->cb, q->context, state, q_cur_offset(q), + &cached_accept_state, &cached_accept_id, 0); + } + if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("proceeding with matching\n"); + } + + assert(q_cur_type(q) == MQE_START); + s64a start = q_cur_loc(q); + + DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start, + mode == CALLBACK_OUTPUT ? 
"CALLBACK OUTPUT" : + mode == NO_MATCHES ? "NO MATCHES" : + mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???"); + + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + + const u8* cur_buf; + if (start < 0) { + DEBUG_PRINTF("negative location, scanning history\n"); + DEBUG_PRINTF("min location: %zd\n", -q->hlength); + cur_buf = q->history + q->hlength; + } else { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max location: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + q->items[q->cur].location = b_end; + return MO_ALIVE; + } + + q->cur++; + + s64a cur_start = start; + + while (1) { + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + s64a end = q_cur_loc(q); + if (mode != NO_MATCHES) { + end = MIN(end, b_end); + } + assert(end <= (s64a) q->length); + s64a cur_end = end; + + /* we may cross the border between history and current buffer */ + if (cur_start < 0) { + cur_end = MIN(0, cur_end); + } + + DEBUG_PRINTF("start: %lli end: %lli\n", start, end); + + /* don't scan zero length buffer */ + if (cur_start != cur_end) { + const u8 * scanned = cur_buf; + char rv; + + if (mode == NO_MATCHES) { + runShengNm(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, cur_buf, + cur_buf + cur_start, cur_buf + cur_end, can_die, + has_accel, single, &scanned, &state); + } else if (mode == CALLBACK_OUTPUT) { + rv = runShengCb(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, cur_buf + cur_end, + can_die, has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return MO_DEAD; + } + } else if (mode == STOP_AT_MATCH) { + rv = runShengSam(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, + cur_buf + cur_end, can_die, has_accel, single, + &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return rv; + } else if (rv == MO_MATCHES_PENDING) { + assert(q->cur); + DEBUG_PRINTF("found a match, setting q location to %zd\n", + scanned - cur_buf + 1); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = + scanned - cur_buf + 1; /* due to exiting early */ + *(u8 *)q->state = state; + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return rv; + } + } else { + assert(!"invalid scanning mode!"); + } + assert(scanned == cur_buf + cur_end); + + cur_start = cur_end; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = b_end; + *(u8 *)q->state = state; + return MO_ALIVE; + } + + /* crossing over into actual buffer */ + if 
(cur_start == 0) { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max offset: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* continue scanning the same buffer */ + if (end != cur_end) { + continue; + } + + switch (q_cur_type(q)) { + case MQE_END: + *(u8 *)q->state = state; + q->cur++; + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + if (can_die) { + return (state & SHENG_STATE_DEAD) ? MO_DEAD : MO_ALIVE; + } + return MO_ALIVE; + case MQE_TOP: + if (q->offset + cur_start == 0) { + DEBUG_PRINTF("Anchored start, going to state %u\n", + sh->anchored); + state = sh->anchored; + } else { + u8 new_state = get_aux(sh, state)->top; + DEBUG_PRINTF("Top event %u->%u\n", state & SHENG_STATE_MASK, + new_state & SHENG_STATE_MASK); + state = new_state; + } + break; + default: + assert(!"invalid queue event"); + break; + } + q->cur++; + } +} + +char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + DEBUG_PRINTF("smallwrite Sheng\n"); + assert(n->type == SHENG_NFA); + const struct sheng *sh = getImplNfa(n); + u8 state = sh->anchored; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + /* scan and report all matches */ + int rv; + s64a end = length; + const u8 *scanned; + + rv = runShengCb(sh, cb, context, offset, &cached_accept_state, + &cached_accept_id, buffer, buffer, buffer + end, can_die, + has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("%u\n", state & SHENG_STATE_MASK); + + const struct sstate_aux *aux = get_aux(sh, state); + + if (aux->accept_eod) { + DEBUG_PRINTF("Reporting EOD matches\n"); + fireReports(sh, cb, context, state, end + offset, &cached_accept_state, + &cached_accept_id, 1); + } + + return state & SHENG_STATE_DEAD ? 
MO_DEAD : MO_ALIVE; +} + +char nfaExecSheng_Q(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng *sh = get_sheng(n); + char rv = runSheng(sh, q, end, CALLBACK_OUTPUT); + return rv; +} + +char nfaExecSheng_Q2(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng *sh = get_sheng(n); + char rv = runSheng(sh, q, end, STOP_AT_MATCH); + return rv; +} + +char nfaExecSheng_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(q_cur_type(q) == MQE_START); + + const struct sheng *sh = get_sheng(n); + char rv = runSheng(sh, q, 0 /* end */, NO_MATCHES); + + if (rv && nfaExecSheng_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } + return rv; +} + +char nfaExecSheng_inAccept(const struct NFA *n, ReportID report, struct mq *q) { + assert(n && q); + + const struct sheng *sh = get_sheng(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG_STATE_MASK)); + + const struct sstate_aux *aux = get_aux(sh, s); + + if (!aux->accept) { + return 0; + } + + return shengHasAccept(sh, aux, report); +} + +char nfaExecSheng_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct sheng *sh = get_sheng(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG_STATE_MASK)); + + const struct sstate_aux *aux = get_aux(sh, s); + return !!aux->accept; +} + +char nfaExecSheng_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback cb, void *ctxt) { + assert(nfa); + + const struct sheng *sh = get_sheng(nfa); + u8 s = *(const u8 *)state; + DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG_STATE_MASK)); + + const struct sstate_aux *aux = get_aux(sh, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + + return fireReports(sh, cb, ctxt, s, offset, NULL, NULL, 1); +} + +char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q) { + const struct sheng *sh = (const struct sheng *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + const struct sstate_aux *aux = get_aux(sh, s); + u64a offset = q_cur_offset(q); + u8 cached_state_id = 0; + ReportID cached_report_id = 0; + assert(q_cur_type(q) == MQE_START); + + if (aux->accept) { + if (sh->flags & SHENG_FLAG_SINGLE_REPORT) { + fireSingleReport(cb, ctxt, sh->report, offset); + } else { + fireReports(sh, cb, ctxt, s, offset, &cached_state_id, + &cached_report_id, 0); + } + } + + return 0; +} + +char nfaExecSheng_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct sheng *sh = get_sheng(nfa); + u8 *s = (u8 *)state; + *s = offset ? 
sh->floating: sh->anchored; + return !(*s & SHENG_STATE_DEAD); +} + +char nfaExecSheng_queueInitState(const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + + /* starting in floating state */ + const struct sheng *sh = get_sheng(nfa); + *(u8 *)q->state = sh->floating; + DEBUG_PRINTF("starting in floating state\n"); + return 0; +} + +char nfaExecSheng_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecSheng_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +#if defined(HAVE_AVX512VBMI) +// Sheng32 +static really_inline +char runSheng32Cb(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 has_accel, u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan and report all matches */ + if (can_die) { + if (has_accel) { + rv = sheng32_4_coda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng32_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + if (has_accel) { + rv = sheng32_4_coa(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng32_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + return MO_ALIVE; +} + +static really_inline +void runSheng32Nm(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, u8 has_accel, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + /* just scan the buffer */ + if (can_die) { + if (has_accel) { + sheng32_4_nmda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + sheng32_4_nmd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, 
offset, cur_buf, + start, end, scanned); + } + sheng32_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } else { + sheng32_4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, start, end, scanned); + sheng32_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } +} + +static really_inline +char runSheng32Sam(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, u8 has_accel, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan until first match */ + if (can_die) { + if (has_accel) { + rv = sheng32_4_samda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng32_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + if (has_accel) { + rv = sheng32_4_sama(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng32_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + return MO_ALIVE; +} + +static never_inline +char runSheng32(const struct sheng32 *sh, struct mq *q, s64a b_end, + enum MatchMode mode) { + u8 state = *(u8 *)q->state; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + DEBUG_PRINTF("starting Sheng32 execution in state %u\n", + state & SHENG32_STATE_MASK); + + if (q->report_current) { + DEBUG_PRINTF("reporting current pending matches\n"); + assert(sh); + + q->report_current = 0; + + int rv; + if (single) { + rv = fireSingleReport(q->cb, q->context, sh->report, + q_cur_offset(q)); + } else { + rv = fireReports32(sh, q->cb, q->context, state, q_cur_offset(q), + &cached_accept_state, &cached_accept_id, 0); + } + if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("proceeding with matching\n"); + } + + 
assert(q_cur_type(q) == MQE_START); + s64a start = q_cur_loc(q); + + DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start, + mode == CALLBACK_OUTPUT ? "CALLBACK OUTPUT" : + mode == NO_MATCHES ? "NO MATCHES" : + mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???"); + + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + + const u8* cur_buf; + if (start < 0) { + DEBUG_PRINTF("negative location, scanning history\n"); + DEBUG_PRINTF("min location: %zd\n", -q->hlength); + cur_buf = q->history + q->hlength; + } else { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max location: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + q->items[q->cur].location = b_end; + return MO_ALIVE; + } + + q->cur++; + + s64a cur_start = start; + + while (1) { + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + s64a end = q_cur_loc(q); + if (mode != NO_MATCHES) { + end = MIN(end, b_end); + } + assert(end <= (s64a) q->length); + s64a cur_end = end; + + /* we may cross the border between history and current buffer */ + if (cur_start < 0) { + cur_end = MIN(0, cur_end); + } + + DEBUG_PRINTF("start: %lli end: %lli\n", start, end); + + /* don't scan zero length buffer */ + if (cur_start != cur_end) { + const u8 * scanned = cur_buf; + char rv; + + if (mode == NO_MATCHES) { + runSheng32Nm(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, cur_buf, + cur_buf + cur_start, cur_buf + cur_end, can_die, + has_accel, single, &scanned, &state); + } else if (mode == CALLBACK_OUTPUT) { + rv = runSheng32Cb(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, cur_buf + cur_end, + can_die, has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return MO_DEAD; + } + } else if (mode == STOP_AT_MATCH) { + rv = runSheng32Sam(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, + cur_buf + cur_end, can_die, has_accel, single, + &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return rv; + } else if (rv == MO_MATCHES_PENDING) { + assert(q->cur); + DEBUG_PRINTF("found a match, setting q location to %zd\n", + scanned - cur_buf + 1); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = + scanned - cur_buf + 1; /* due to exiting early */ + *(u8 *)q->state = state; + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return rv; + } + } else { + assert(!"invalid scanning mode!"); + } + assert(scanned == cur_buf + cur_end); + + cur_start = cur_end; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + 
q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = b_end; + *(u8 *)q->state = state; + return MO_ALIVE; + } + + /* crossing over into actual buffer */ + if (cur_start == 0) { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max offset: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* continue scanning the same buffer */ + if (end != cur_end) { + continue; + } + + switch (q_cur_type(q)) { + case MQE_END: + *(u8 *)q->state = state; + q->cur++; + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + if (can_die) { + return (state & SHENG32_STATE_DEAD) ? MO_DEAD : MO_ALIVE; + } + return MO_ALIVE; + case MQE_TOP: + if (q->offset + cur_start == 0) { + DEBUG_PRINTF("Anchored start, going to state %u\n", + sh->anchored); + state = sh->anchored; + } else { + u8 new_state = get_aux32(sh, state)->top; + DEBUG_PRINTF("Top event %u->%u\n", state & SHENG32_STATE_MASK, + new_state & SHENG32_STATE_MASK); + state = new_state; + } + break; + default: + assert(!"invalid queue event"); + break; + } + q->cur++; + } +} + +char nfaExecSheng32_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + DEBUG_PRINTF("smallwrite Sheng32\n"); + assert(n->type == SHENG_NFA_32); + const struct sheng32 *sh = getImplNfa(n); + u8 state = sh->anchored; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + /* scan and report all matches */ + int rv; + s64a end = length; + const u8 *scanned; + + rv = runSheng32Cb(sh, cb, context, offset, &cached_accept_state, + &cached_accept_id, buffer, buffer, buffer + end, can_die, + has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("%u\n", state & SHENG32_STATE_MASK); + + const struct sstate_aux *aux = get_aux32(sh, state); + + if (aux->accept_eod) { + DEBUG_PRINTF("Reporting EOD matches\n"); + fireReports32(sh, cb, context, state, end + offset, + &cached_accept_state, &cached_accept_id, 1); + } + + return state & SHENG32_STATE_DEAD ? 
MO_DEAD : MO_ALIVE; +} + +char nfaExecSheng32_Q(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, end, CALLBACK_OUTPUT); + return rv; +} + +char nfaExecSheng32_Q2(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, end, STOP_AT_MATCH); + return rv; +} + +char nfaExecSheng32_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(q_cur_type(q) == MQE_START); + + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, 0 /* end */, NO_MATCHES); + + if (rv && nfaExecSheng32_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } + return rv; +} + +char nfaExecSheng32_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct sheng32 *sh = get_sheng32(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + + if (!aux->accept) { + return 0; + } + + return sheng32HasAccept(sh, aux, report); +} + +char nfaExecSheng32_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct sheng32 *sh = get_sheng32(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + return !!aux->accept; +} + +char nfaExecSheng32_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback cb, void *ctxt) { + assert(nfa); + + const struct sheng32 *sh = get_sheng32(nfa); + u8 s = *(const u8 *)state; + DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + + return fireReports32(sh, cb, ctxt, s, offset, NULL, NULL, 1); +} + +char nfaExecSheng32_reportCurrent(const struct NFA *n, struct mq *q) { + const struct sheng32 *sh = (const struct sheng32 *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + const struct sstate_aux *aux = get_aux32(sh, s); + u64a offset = q_cur_offset(q); + u8 cached_state_id = 0; + ReportID cached_report_id = 0; + assert(q_cur_type(q) == MQE_START); + + if (aux->accept) { + if (sh->flags & SHENG_FLAG_SINGLE_REPORT) { + fireSingleReport(cb, ctxt, sh->report, offset); + } else { + fireReports32(sh, cb, ctxt, s, offset, &cached_state_id, + &cached_report_id, 0); + } + } + + return 0; +} + +char nfaExecSheng32_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct sheng32 *sh = get_sheng32(nfa); + u8 *s = (u8 *)state; + *s = offset ? 
sh->floating: sh->anchored; + return !(*s & SHENG32_STATE_DEAD); +} + +char nfaExecSheng32_queueInitState(const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + + /* starting in floating state */ + const struct sheng32 *sh = get_sheng32(nfa); + *(u8 *)q->state = sh->floating; + DEBUG_PRINTF("starting in floating state\n"); + return 0; +} + +char nfaExecSheng32_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecSheng32_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +// Sheng64 +static really_inline +char runSheng64Cb(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single); + int rv; + /* scan and report all matches */ + if (can_die) { + rv = sheng64_4_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng64_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + rv = sheng64_4_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng64_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + return MO_ALIVE; +} + +static really_inline +void runSheng64Nm(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single); + /* just scan the buffer */ + if (can_die) { + sheng64_4_nmd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + sheng64_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } else { + sheng64_4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, start, end, scanned); + sheng64_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } +} + +static really_inline +char runSheng64Sam(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const 
cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single); + int rv; + /* scan until first match */ + if (can_die) { + rv = sheng64_4_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng64_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + rv = sheng64_4_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng64_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + return MO_ALIVE; +} + +static never_inline +char runSheng64(const struct sheng64 *sh, struct mq *q, s64a b_end, + enum MatchMode mode) { + u8 state = *(u8 *)q->state; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + DEBUG_PRINTF("starting Sheng64 execution in state %u\n", + state & SHENG64_STATE_MASK); + + if (q->report_current) { + DEBUG_PRINTF("reporting current pending matches\n"); + assert(sh); + + q->report_current = 0; + + int rv; + if (single) { + rv = fireSingleReport(q->cb, q->context, sh->report, + q_cur_offset(q)); + } else { + rv = fireReports64(sh, q->cb, q->context, state, q_cur_offset(q), + &cached_accept_state, &cached_accept_id, 0); + } + if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("proceeding with matching\n"); + } + + assert(q_cur_type(q) == MQE_START); + s64a start = q_cur_loc(q); + + DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start, + mode == CALLBACK_OUTPUT ? "CALLBACK OUTPUT" : + mode == NO_MATCHES ? "NO MATCHES" : + mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???"); + + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? 
"END" : "???"); + + const u8* cur_buf; + if (start < 0) { + DEBUG_PRINTF("negative location, scanning history\n"); + DEBUG_PRINTF("min location: %zd\n", -q->hlength); + cur_buf = q->history + q->hlength; + } else { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max location: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + q->items[q->cur].location = b_end; + return MO_ALIVE; + } + + q->cur++; + + s64a cur_start = start; + + while (1) { + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + s64a end = q_cur_loc(q); + if (mode != NO_MATCHES) { + end = MIN(end, b_end); + } + assert(end <= (s64a) q->length); + s64a cur_end = end; + + /* we may cross the border between history and current buffer */ + if (cur_start < 0) { + cur_end = MIN(0, cur_end); + } + + DEBUG_PRINTF("start: %lli end: %lli\n", start, end); + + /* don't scan zero length buffer */ + if (cur_start != cur_end) { + const u8 * scanned = cur_buf; + char rv; + + if (mode == NO_MATCHES) { + runSheng64Nm(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, cur_buf, + cur_buf + cur_start, cur_buf + cur_end, can_die, + single, &scanned, &state); + } else if (mode == CALLBACK_OUTPUT) { + rv = runSheng64Cb(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, cur_buf + cur_end, + can_die, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return MO_DEAD; + } + } else if (mode == STOP_AT_MATCH) { + rv = runSheng64Sam(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, + cur_buf + cur_end, can_die, single, + &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return rv; + } else if (rv == MO_MATCHES_PENDING) { + assert(q->cur); + DEBUG_PRINTF("found a match, setting q location to %zd\n", + scanned - cur_buf + 1); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = + scanned - cur_buf + 1; /* due to exiting early */ + *(u8 *)q->state = state; + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return rv; + } + } else { + assert(!"invalid scanning mode!"); + } + assert(scanned == cur_buf + cur_end); + + cur_start = cur_end; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = b_end; + *(u8 *)q->state = state; + return MO_ALIVE; + } + + /* crossing over into actual buffer */ + if (cur_start == 0) { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max offset: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* continue scanning the same buffer */ + if (end != cur_end) { + continue; + } + + switch (q_cur_type(q)) { + case MQE_END: + *(u8 *)q->state = state; + q->cur++; + 
DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + if (can_die) { + return (state & SHENG64_STATE_DEAD) ? MO_DEAD : MO_ALIVE; + } + return MO_ALIVE; + case MQE_TOP: + if (q->offset + cur_start == 0) { + DEBUG_PRINTF("Anchored start, going to state %u\n", + sh->anchored); + state = sh->anchored; + } else { + u8 new_state = get_aux64(sh, state)->top; + DEBUG_PRINTF("Top event %u->%u\n", state & SHENG64_STATE_MASK, + new_state & SHENG64_STATE_MASK); + state = new_state; + } + break; + default: + assert(!"invalid queue event"); + break; + } + q->cur++; + } +} + +char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + DEBUG_PRINTF("smallwrite Sheng64\n"); + assert(n->type == SHENG_NFA_64); + const struct sheng64 *sh = getImplNfa(n); + u8 state = sh->anchored; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + /* scan and report all matches */ + int rv; + s64a end = length; + const u8 *scanned; + + rv = runSheng64Cb(sh, cb, context, offset, &cached_accept_state, + &cached_accept_id, buffer, buffer, buffer + end, can_die, + single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("%u\n", state & SHENG64_STATE_MASK); + + const struct sstate_aux *aux = get_aux64(sh, state); + + if (aux->accept_eod) { + DEBUG_PRINTF("Reporting EOD matches\n"); + fireReports64(sh, cb, context, state, end + offset, + &cached_accept_state, &cached_accept_id, 1); + } + + return state & SHENG64_STATE_DEAD ? MO_DEAD : MO_ALIVE; +} + +char nfaExecSheng64_Q(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng64 *sh = get_sheng64(n); + char rv = runSheng64(sh, q, end, CALLBACK_OUTPUT); + return rv; +} + +char nfaExecSheng64_Q2(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng64 *sh = get_sheng64(n); + char rv = runSheng64(sh, q, end, STOP_AT_MATCH); + return rv; +} + +char nfaExecSheng64_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(q_cur_type(q) == MQE_START); + + const struct sheng64 *sh = get_sheng64(n); + char rv = runSheng64(sh, q, 0 /* end */, NO_MATCHES); + + if (rv && nfaExecSheng64_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } + return rv; +} + +char nfaExecSheng64_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct sheng64 *sh = get_sheng64(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG64_STATE_MASK)); + + const struct sstate_aux *aux = get_aux64(sh, s); + + if (!aux->accept) { + return 0; + } + + return sheng64HasAccept(sh, aux, report); +} + +char nfaExecSheng64_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct sheng64 *sh = get_sheng64(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG64_STATE_MASK)); + + const struct sstate_aux *aux = get_aux64(sh, s); + return !!aux->accept; +} + +char nfaExecSheng64_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback cb, void *ctxt) { + assert(nfa); + + const struct sheng64 *sh = get_sheng64(nfa); + u8 s = *(const u8 *)state; + DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG64_STATE_MASK)); + + const struct sstate_aux *aux = get_aux64(sh, s); + + if (!aux->accept_eod) { 
+ return MO_CONTINUE_MATCHING; + } + + return fireReports64(sh, cb, ctxt, s, offset, NULL, NULL, 1); +} + +char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q) { + const struct sheng64 *sh = (const struct sheng64 *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + const struct sstate_aux *aux = get_aux64(sh, s); + u64a offset = q_cur_offset(q); + u8 cached_state_id = 0; + ReportID cached_report_id = 0; + assert(q_cur_type(q) == MQE_START); + + if (aux->accept) { + if (sh->flags & SHENG_FLAG_SINGLE_REPORT) { + fireSingleReport(cb, ctxt, sh->report, offset); + } else { + fireReports64(sh, cb, ctxt, s, offset, &cached_state_id, + &cached_report_id, 0); + } + } + + return 0; +} + +char nfaExecSheng64_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct sheng64 *sh = get_sheng64(nfa); + u8 *s = (u8 *)state; + *s = offset ? sh->floating: sh->anchored; + return !(*s & SHENG64_STATE_DEAD); +} + +char nfaExecSheng64_queueInitState(const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + + /* starting in floating state */ + const struct sheng64 *sh = get_sheng64(nfa); + *(u8 *)q->state = sh->floating; + DEBUG_PRINTF("starting in floating state\n"); + return 0; +} + +char nfaExecSheng64_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecSheng64_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} +#endif // end of HAVE_AVX512VBMI diff --git a/regex/nfa/sheng.h b/regex/nfa/sheng.h new file mode 100644 index 000000000..7b90e3034 --- /dev/null +++ b/regex/nfa/sheng.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SHENG_H_ +#define SHENG_H_ + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; + +#define nfaExecSheng_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng_zombie_status NFA_API_ZOMBIE_NO_IMPL + +char nfaExecSheng_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecSheng_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecSheng_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecSheng_queueInitState(const struct NFA *nfa, struct mq *q); +char nfaExecSheng_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecSheng_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +char nfaExecSheng_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, u8 key); +char nfaExecSheng_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q); + +char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +#if defined(HAVE_AVX512VBMI) +#define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL + +char nfaExecSheng32_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng32_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng32_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecSheng32_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecSheng32_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecSheng32_queueInitState(const struct NFA *nfa, struct mq *q); +char nfaExecSheng32_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecSheng32_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +char nfaExecSheng32_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, u8 key); +char nfaExecSheng32_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecSheng32_reportCurrent(const struct NFA *n, struct mq *q); + +char nfaExecSheng32_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +#define nfaExecSheng64_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng64_zombie_status NFA_API_ZOMBIE_NO_IMPL + +char nfaExecSheng64_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng64_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng64_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecSheng64_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecSheng64_inAnyAccept(const struct NFA 
*n, struct mq *q); +char nfaExecSheng64_queueInitState(const struct NFA *nfa, struct mq *q); +char nfaExecSheng64_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecSheng64_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +char nfaExecSheng64_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, u8 key); +char nfaExecSheng64_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q); + +char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +#else // !HAVE_AVX512VBMI + +#define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecSheng32_Q NFA_API_NO_IMPL +#define nfaExecSheng32_Q2 NFA_API_NO_IMPL +#define nfaExecSheng32_QR NFA_API_NO_IMPL +#define nfaExecSheng32_inAccept NFA_API_NO_IMPL +#define nfaExecSheng32_inAnyAccept NFA_API_NO_IMPL +#define nfaExecSheng32_queueInitState NFA_API_NO_IMPL +#define nfaExecSheng32_queueCompressState NFA_API_NO_IMPL +#define nfaExecSheng32_expandState NFA_API_NO_IMPL +#define nfaExecSheng32_initCompressedState NFA_API_NO_IMPL +#define nfaExecSheng32_testEOD NFA_API_NO_IMPL +#define nfaExecSheng32_reportCurrent NFA_API_NO_IMPL +#define nfaExecSheng32_B NFA_API_NO_IMPL + +#define nfaExecSheng64_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng64_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecSheng64_Q NFA_API_NO_IMPL +#define nfaExecSheng64_Q2 NFA_API_NO_IMPL +#define nfaExecSheng64_QR NFA_API_NO_IMPL +#define nfaExecSheng64_inAccept NFA_API_NO_IMPL +#define nfaExecSheng64_inAnyAccept NFA_API_NO_IMPL +#define nfaExecSheng64_queueInitState NFA_API_NO_IMPL +#define nfaExecSheng64_queueCompressState NFA_API_NO_IMPL +#define nfaExecSheng64_expandState NFA_API_NO_IMPL +#define nfaExecSheng64_initCompressedState NFA_API_NO_IMPL +#define nfaExecSheng64_testEOD NFA_API_NO_IMPL +#define nfaExecSheng64_reportCurrent NFA_API_NO_IMPL +#define nfaExecSheng64_B NFA_API_NO_IMPL +#endif // end of HAVE_AVX512VBMI + +#endif /* SHENG_H_ */ diff --git a/regex/nfa/sheng_defs.h b/regex/nfa/sheng_defs.h new file mode 100644 index 000000000..390af7522 --- /dev/null +++ b/regex/nfa/sheng_defs.h @@ -0,0 +1,754 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SHENG_DEFS_H +#define SHENG_DEFS_H + +/* + * Utility functions used by various versions of Sheng engine + */ +static really_inline +u8 isDeadState(const u8 a) { + return a & SHENG_STATE_DEAD; +} + +static really_inline +u8 isAcceptState(const u8 a) { + return a & SHENG_STATE_ACCEPT; +} + +static really_inline +u8 isAccelState(const u8 a) { + return a & SHENG_STATE_ACCEL; +} + +static really_inline +u8 hasInterestingStates(const u8 a, const u8 b, const u8 c, const u8 d) { + return (a | b | c | d) & (SHENG_STATE_FLAG_MASK); +} + +#if defined(HAVE_AVX512VBMI) +static really_inline +u8 isDeadState32(const u8 a) { + return a & SHENG32_STATE_DEAD; +} + +static really_inline +u8 isAcceptState32(const u8 a) { + return a & SHENG32_STATE_ACCEPT; +} + +static really_inline +u8 isAccelState32(const u8 a) { + return a & SHENG32_STATE_ACCEL; +} + +static really_inline +u8 hasInterestingStates32(const u8 a, const u8 b, const u8 c, const u8 d) { + return (a | b | c | d) & (SHENG32_STATE_FLAG_MASK); +} + +static really_inline +u8 isDeadState64(const u8 a) { + return a & SHENG64_STATE_DEAD; +} + +static really_inline +u8 isAcceptState64(const u8 a) { + return a & SHENG64_STATE_ACCEPT; +} + +static really_inline +u8 hasInterestingStates64(const u8 a, const u8 b, const u8 c, const u8 d) { + return (a | b | c | d) & (SHENG64_STATE_FLAG_MASK); +} +#endif + +/* these functions should be optimized out, used by NO_MATCHES mode */ +static really_inline +u8 dummyFunc4(UNUSED const u8 a, UNUSED const u8 b, UNUSED const u8 c, + UNUSED const u8 d) { + return 0; +} + +static really_inline +u8 dummyFunc(UNUSED const u8 a) { + return 0; +} + +/* + * Sheng function definitions for single byte loops + */ +/* callback output, can die */ +#define SHENG_IMPL sheng_cod +#define DEAD_FUNC isDeadState +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_cod +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_cod +#define DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* callback output, can't die */ +#define SHENG_IMPL sheng_co +#define DEAD_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_co +#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_co +#define DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef 
ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* stop at match, can die */ +#define SHENG_IMPL sheng_samd +#define DEAD_FUNC isDeadState +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_samd +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_samd +#define DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 1 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* stop at match, can't die */ +#define SHENG_IMPL sheng_sam +#define DEAD_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_sam +#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_sam +#define DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 1 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* no match, can die */ +#define SHENG_IMPL sheng_nmd +#define DEAD_FUNC isDeadState +#define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_nmd +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_nmd +#define DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 dummyFunc +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* no match, can't die */ +#define SHENG_IMPL sheng_nm +#define DEAD_FUNC dummyFunc +#define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_nm +#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_nm +#define DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 dummyFunc +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* + * Sheng function definitions for 4-byte loops + */ +/* callback output, can die, accelerated */ +#define SHENG_IMPL sheng4_coda +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_coda +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef 
INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif +#undef STOP_AT_MATCH + +/* callback output, can die, not accelerated */ +#define SHENG_IMPL sheng4_cod +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_cod +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_cod +#define INTERESTING_FUNC64 hasInterestingStates64 +#define INNER_DEAD_FUNC64 isDeadState64 +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* callback output, can't die, accelerated */ +#define SHENG_IMPL sheng4_coa +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_coa +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif +#undef STOP_AT_MATCH + +/* callback output, can't die, not accelerated */ +#define SHENG_IMPL sheng4_co +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_co +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_co +#define INTERESTING_FUNC64 hasInterestingStates64 +#define 
INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* stop at match, can die, accelerated */ +#define SHENG_IMPL sheng4_samda +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_samda +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif +#undef STOP_AT_MATCH + +/* stop at match, can die, not accelerated */ +#define SHENG_IMPL sheng4_samd +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_samd +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_samd +#define INTERESTING_FUNC64 hasInterestingStates64 +#define INNER_DEAD_FUNC64 isDeadState64 +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* stop at match, can't die, accelerated */ +#define SHENG_IMPL sheng4_sama +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) 
+#define SHENG32_IMPL sheng32_4_sama +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif +#undef STOP_AT_MATCH + +/* stop at match, can't die, not accelerated */ +#define SHENG_IMPL sheng4_sam +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_sam +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_sam +#define INTERESTING_FUNC64 hasInterestingStates64 +#define INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* no-match have interesting func as dummy, and die/accel checks are outer */ + +/* no match, can die, accelerated */ +#define SHENG_IMPL sheng4_nmda +#define INTERESTING_FUNC dummyFunc4 +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC isDeadState +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC isAccelState +#define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nmda +#define INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 isDeadState32 +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 isAccelState32 +#define ACCEPT_FUNC32 dummyFunc +#define NO_SHENG64_IMPL +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif +#undef STOP_AT_MATCH + +/* no match, can die, not accelerated */ +#define SHENG_IMPL sheng4_nmd +#define INTERESTING_FUNC dummyFunc4 +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC isDeadState +#define 
INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nmd +#define INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 isDeadState32 +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_4_nmd +#define INTERESTING_FUNC64 dummyFunc4 +#define INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 dummyFunc +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* there is no performance benefit in accelerating a no-match case that can't + * die */ + +/* no match, can't die */ +#define SHENG_IMPL sheng4_nm +#define INTERESTING_FUNC dummyFunc4 +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nm +#define INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_4_nm +#define INTERESTING_FUNC64 dummyFunc4 +#define INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 dummyFunc +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +#endif // SHENG_DEFS_H diff --git a/regex/nfa/sheng_impl.h b/regex/nfa/sheng_impl.h new file mode 100644 index 000000000..fb8ee1683 --- /dev/null +++ b/regex/nfa/sheng_impl.h @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * In order to use this macro, the following things need to be defined: + * + * - SHENG_IMPL (name of the Sheng implementation function) + * - DEAD_FUNC (name of the function checking for dead states) + * - ACCEPT_FUNC (name of the function checking for accept state) + * - STOP_AT_MATCH (can be 1 or 0, enable or disable stop at match) + */ + +/* byte-by-byte version. we don't do byte-by-byte death checking as it's + * pretty pointless to do it over a buffer that's at most 3 bytes long */ +static really_inline +char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFA execution in state %u\n", + *state & SHENG_STATE_MASK); + const u8 *cur_buf = start; + if (DEAD_FUNC(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + m128 cur_state = set16x8(*state); + const m128 *masks = s->shuffle_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + const m128 shuffle_mask = masks[c]; + cur_state = pshufb_m128(shuffle_mask, cur_state); + const u8 tmp = movd(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? 
c : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", tmp, (tmp & 0xF0) >> 4, + tmp & 0xF); + + if (unlikely(ACCEPT_FUNC(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = movd(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} + +#if defined(HAVE_AVX512VBMI) +static really_inline +char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng32 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFA execution in state %u\n", + *state & SHENG32_STATE_MASK); + const u8 *cur_buf = start; + if (DEAD_FUNC32(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + m512 cur_state = set64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + const m512 succ_mask = masks[c]; + cur_state = vpermb512(cur_state, succ_mask); + const u8 tmp = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? 
c : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG32_STATE_MASK, + tmp & SHENG32_STATE_FLAG_MASK); + + if (unlikely(ACCEPT_FUNC32(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG32_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} + +static really_inline +char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng64 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFA execution in state %u\n", + *state & SHENG64_STATE_MASK); + const u8 *cur_buf = start; + if (DEAD_FUNC64(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + m512 cur_state = set64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + const m512 succ_mask = masks[c]; + cur_state = vpermb512(cur_state, succ_mask); + const u8 tmp = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG64_STATE_MASK, + tmp & SHENG64_STATE_FLAG_MASK); + + if (unlikely(ACCEPT_FUNC64(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG64_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} +#endif diff --git a/regex/nfa/sheng_impl4.h b/regex/nfa/sheng_impl4.h new file mode 100644 index 000000000..440e7396e --- /dev/null +++ b/regex/nfa/sheng_impl4.h @@ -0,0 +1,711 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * In order to use this macro, the following things need to be defined: + * + * - SHENG_IMPL (name of the Sheng implementation function) + * - INTERESTING_FUNC (name of the function checking for accept, accel or dead + * states) + * - INNER_DEAD_FUNC (name of the inner function checking for dead states) + * - OUTER_DEAD_FUNC (name of the outer function checking for dead states) + * - INNER_ACCEL_FUNC (name of the inner function checking for accel states) + * - OUTER_ACCEL_FUNC (name of the outer function checking for accel states) + * - ACCEPT_FUNC (name of the function checking for accept state) + * - STOP_AT_MATCH (can be 1 or 0, enable or disable stop at match) + */ + +/* unrolled 4-byte-at-a-time version. + * + * we put innerDeadFunc inside interestingFunc() block so that we don't pay for + * dead states checking. however, if interestingFunc is dummy, innerDeadFunc + * gets lost with it, so we need an additional check outside the + * interestingFunc() branch - it's normally dummy so we don't pay for it, but + * when interestingFunc is dummy, outerDeadFunc should be set if we want to + * check for dead states. + * + * also, deadFunc only checks the last known state, but since we can't ever get + * out of the dead state and we don't really care where we died, it's not a + * problem. 
+ */ +static really_inline +char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG_STATE_MASK); + const u8 *cur_buf = start; + const u8 *min_accel_dist = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if (INNER_ACCEL_FUNC(*state) || OUTER_ACCEL_FUNC(*state)) { + DEBUG_PRINTF("Accel state reached @ 0\n"); + const union AccelAux *aaux = get_accel(s, *state & SHENG_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf, end); + if (new_offset < cur_buf + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %zu bytes\n", new_offset - cur_buf); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %lli\n", (s64a)(cur_buf - start)); + } + if (INNER_DEAD_FUNC(*state) || OUTER_DEAD_FUNC(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + m128 cur_state = set16x8(*state); + const m128 *masks = s->shuffle_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + + const m128 shuffle_mask1 = masks[c1]; + cur_state = pshufb_m128(shuffle_mask1, cur_state); + const u8 a1 = movd(cur_state); + + const m128 shuffle_mask2 = masks[c2]; + cur_state = pshufb_m128(shuffle_mask2, cur_state); + const u8 a2 = movd(cur_state); + + const m128 shuffle_mask3 = masks[c3]; + cur_state = pshufb_m128(shuffle_mask3, cur_state); + const u8 a3 = movd(cur_state); + + const m128 shuffle_mask4 = masks[c4]; + cur_state = pshufb_m128(shuffle_mask4, cur_state); + const u8 a4 = movd(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a1, (a1 & 0xF0) >> 4, a1 & 0xF); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a2, (a2 & 0xF0) >> 4, a2 & 0xF); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a3, (a3 & 0xF0) >> 4, a3 & 0xF); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? 
c4 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a4, (a4 & 0xF0) >> 4, a4 & 0xF); + + if (unlikely(INTERESTING_FUNC(a1, a2, a3, a4))) { + if (ACCEPT_FUNC(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a3, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a4, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + if (cur_buf > min_accel_dist && INNER_ACCEL_FUNC(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel(s, a4 & SHENG_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + 
continue; + } + } + if (OUTER_DEAD_FUNC(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + }; + if (cur_buf > min_accel_dist && OUTER_ACCEL_FUNC(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = get_accel(s, a4 & SHENG_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + }; + cur_buf += 4; + } + *state = movd(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} + +#if defined(HAVE_AVX512VBMI) +static really_inline +char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng32 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG32_STATE_MASK); + const u8 *cur_buf = start; + const u8 *min_accel_dist = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if (INNER_ACCEL_FUNC32(*state) || OUTER_ACCEL_FUNC32(*state)) { + DEBUG_PRINTF("Accel state reached @ 0\n"); + const union AccelAux *aaux = + get_accel32(s, *state & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf, end); + if (new_offset < cur_buf + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %zu bytes\n", new_offset - cur_buf); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %lli\n", (s64a)(cur_buf - start)); + } + if (INNER_DEAD_FUNC32(*state) || OUTER_DEAD_FUNC32(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + m512 cur_state = set64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + + const m512 succ_mask1 = masks[c1]; + cur_state = vpermb512(cur_state, succ_mask1); + const u8 a1 = movd512(cur_state); + + const m512 succ_mask2 = masks[c2]; + cur_state = vpermb512(cur_state, succ_mask2); + const u8 a2 = movd512(cur_state); + + const m512 succ_mask3 = masks[c3]; + cur_state = vpermb512(cur_state, succ_mask3); + const u8 a3 = movd512(cur_state); + + const m512 succ_mask4 = masks[c4]; + cur_state = vpermb512(cur_state, succ_mask4); + const u8 a4 = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG32_STATE_MASK, + a1 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG32_STATE_MASK, + a2 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? 
c3 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG32_STATE_MASK, + a3 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG32_STATE_MASK, + a4 & SHENG32_STATE_FLAG_MASK); + + if (unlikely(INTERESTING_FUNC32(a1, a2, a3, a4))) { + if (ACCEPT_FUNC32(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a3, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a4, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC32(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + if (cur_buf > min_accel_dist && INNER_ACCEL_FUNC32(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel32(s, a4 & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel 
chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + } + } + if (OUTER_DEAD_FUNC32(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + }; + if (cur_buf > min_accel_dist && OUTER_ACCEL_FUNC32(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel32(s, a4 & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + }; + cur_buf += 4; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} + +#ifndef NO_SHENG64_IMPL +static really_inline +char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng64 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG64_STATE_MASK); + const u8 *cur_buf = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if (INNER_DEAD_FUNC64(*state) || OUTER_DEAD_FUNC64(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + m512 cur_state = set64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + + const m512 succ_mask1 = masks[c1]; + cur_state = vpermb512(cur_state, succ_mask1); + const u8 a1 = movd512(cur_state); + + const m512 succ_mask2 = masks[c2]; + cur_state = vpermb512(cur_state, succ_mask2); + const u8 a2 = movd512(cur_state); + + const m512 succ_mask3 = masks[c3]; + cur_state = vpermb512(cur_state, succ_mask3); + const u8 a3 = movd512(cur_state); + + const m512 succ_mask4 = masks[c4]; + cur_state = vpermb512(cur_state, succ_mask4); + const u8 a4 = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG64_STATE_MASK, + a1 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG64_STATE_MASK, + a2 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG64_STATE_MASK, + a3 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? 
c4 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG64_STATE_MASK, + a4 & SHENG64_STATE_FLAG_MASK); + + if (unlikely(INTERESTING_FUNC64(a1, a2, a3, a4))) { + if (ACCEPT_FUNC64(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a3, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a4, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC64(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + } + if (OUTER_DEAD_FUNC64(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + cur_buf += 4; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} +#endif // !NO_SHENG64_IMPL +#endif diff --git a/regex/nfa/sheng_internal.h b/regex/nfa/sheng_internal.h new file mode 100644 index 000000000..98536886c --- /dev/null +++ b/regex/nfa/sheng_internal.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + 
* modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SHENG_INTERNAL_H_ +#define SHENG_INTERNAL_H_ + +#include "ue2common.h" +#include "util/simd_types.h" + +#define SHENG_STATE_ACCEPT 0x10 +#define SHENG_STATE_DEAD 0x20 +#define SHENG_STATE_ACCEL 0x40 +#define SHENG_STATE_MASK 0xF +#define SHENG_STATE_FLAG_MASK 0x70 + +#define SHENG32_STATE_ACCEPT 0x20 +#define SHENG32_STATE_DEAD 0x40 +#define SHENG32_STATE_ACCEL 0x80 +#define SHENG32_STATE_MASK 0x1F +#define SHENG32_STATE_FLAG_MASK 0xE0 + +#define SHENG64_STATE_ACCEPT 0x40 +#define SHENG64_STATE_DEAD 0x80 +#define SHENG64_STATE_MASK 0x3F +#define SHENG64_STATE_FLAG_MASK 0xC0 + +#define SHENG_FLAG_SINGLE_REPORT 0x1 +#define SHENG_FLAG_CAN_DIE 0x2 +#define SHENG_FLAG_HAS_ACCEL 0x4 + +struct report_list { + u32 count; + ReportID report[]; +}; + +struct sstate_aux { + u32 accept; + u32 accept_eod; + u32 accel; + u32 top; +}; + +struct sheng { + m128 shuffle_masks[256]; + u32 length; + u32 aux_offset; + u32 report_offset; + u32 accel_offset; + u8 n_states; + u8 anchored; + u8 floating; + u8 flags; + ReportID report; +}; + +struct sheng32 { + m512 succ_masks[256]; + u32 length; + u32 aux_offset; + u32 report_offset; + u32 accel_offset; + u8 n_states; + u8 anchored; + u8 floating; + u8 flags; + ReportID report; +}; + +struct sheng64 { + m512 succ_masks[256]; + u32 length; + u32 aux_offset; + u32 report_offset; + u32 accel_offset; + u8 n_states; + u8 anchored; + u8 floating; + u8 flags; + ReportID report; +}; + +#endif /* SHENG_INTERNAL_H_ */ diff --git a/regex/nfa/shufti.c b/regex/nfa/shufti.c new file mode 100644 index 000000000..09ffc0cf9 --- /dev/null +++ b/regex/nfa/shufti.c @@ -0,0 +1,1097 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Shufti: character class acceleration. + * + * Utilises the SSSE3 pshufb shuffle instruction + */ + +#include "shufti.h" +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" +#include "util/unaligned.h" + +#ifdef DEBUG +#include + +#define DUMP_MSK(_t) \ +static UNUSED \ +void dumpMsk##_t(m##_t msk) { \ + u8 * mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + for (int j = 0; j < 8; j++) { \ + if ((c >> (7-j)) & 0x1) \ + printf("1"); \ + else \ + printf("0"); \ + } \ + printf(" "); \ + } \ +} \ +static UNUSED \ +void dumpMsk##_t##AsChars(m##_t msk) { \ + u8 * mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + if (isprint(c)) \ + printf("%c",c); \ + else \ + printf("."); \ + } \ +} + +#endif + +/** \brief Naive byte-by-byte implementation. */ +static really_inline +const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, + const u8 *buf_end) { + assert(buf < buf_end); + + for (; buf < buf_end; ++buf) { + u8 c = *buf; + if (lo[c & 0xf] & hi[c >> 4]) { + break; + } + } + return buf; +} + +/** \brief Naive byte-by-byte implementation. 
*/ +static really_inline +const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, + const u8 *buf_end) { + assert(buf < buf_end); + + for (buf_end--; buf_end >= buf; buf_end--) { + u8 c = *buf_end; + if (lo[c & 0xf] & hi[c >> 4]) { + break; + } + } + return buf_end; +} + +#if !defined(HAVE_AVX2) +/* Normal SSSE3 shufti */ + +#ifdef DEBUG +DUMP_MSK(128) +#endif + +#define GET_LO_4(chars) and128(chars, low4bits) +#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) + +static really_inline +u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, + const m128 compare) { + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); + m128 t = and128(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); +#endif + return movemask128(eq128(t, compare)); +} + +static really_inline +const u8 *firstMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffff)) { + u32 pos = ctz32(~z & 0xffff); + assert(pos < 16); + return buf + pos; + } else { + return NULL; // no match + } +} + +static really_inline +const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, + const m128 low4bits, const m128 zeroes) { + u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + return firstMatch(buf, z); +} + +const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + + // Slow path for small cases. + if (buf_end - buf < 16) { + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); + } + + const m128 zeroes = zeroes128(); + const m128 low4bits = _mm_set1_epi8(0xf); + const u8 *rv; + + size_t min = (size_t)buf % 16; + assert(buf_end - buf >= 16); + + // Preconditioning: most of the time our buffer won't be aligned. + m128 chars = loadu128(buf); + rv = fwdBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes); + if (rv) { + return rv; + } + buf += (16 - min); + + // Unrolling was here, but it wasn't doing anything but taking up space. + // Reroll FTW. + + const u8 *last_block = buf_end - 16; + while (buf < last_block) { + m128 lchars = load128(buf); + rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes); + if (rv) { + return rv; + } + buf += 16; + } + + // Use an unaligned load to mop up the last 16 bytes and get an accurate + // picture to buf_end. 
+ assert(buf <= buf_end && buf >= buf_end - 16); + chars = loadu128(buf_end - 16); + rv = fwdBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes); + if (rv) { + return rv; + } + + return buf_end; +} + +static really_inline +const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { +#ifdef DEBUG + DEBUG_PRINTF("confirming match in:"); dumpMsk128(t); printf("\n"); +#endif + + u32 z = movemask128(eq128(t, compare)); + if (unlikely(z != 0xffff)) { + u32 pos = clz32(~z & 0xffff); + DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); + assert(pos >= 16 && pos < 32); + return buf + (31 - pos); + } else { + return NULL; // no match + } +} + + +static really_inline +const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, + const m128 low4bits, const m128 zeroes) { + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); + m128 t = and128(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); +#endif + + return lastMatch(buf, t, zeroes); +} + +const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + + // Slow path for small cases. + if (buf_end - buf < 16) { + return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); + } + + const m128 zeroes = zeroes128(); + const m128 low4bits = _mm_set1_epi8(0xf); + const u8 *rv; + + assert(buf_end - buf >= 16); + + // Preconditioning: most of the time our buffer won't be aligned. + m128 chars = loadu128(buf_end - 16); + rv = revBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes); + if (rv) { + return rv; + } + buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf)); + + // Unrolling was here, but it wasn't doing anything but taking up space. + // Reroll FTW. + + const u8 *last_block = buf + 16; + while (buf_end > last_block) { + buf_end -= 16; + m128 lchars = load128(buf_end); + rv = revBlock(mask_lo, mask_hi, lchars, buf_end, low4bits, zeroes); + if (rv) { + return rv; + } + } + + // Use an unaligned load to mop up the last 16 bytes and get an accurate + // picture to buf. 
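+    // (If this final overlapping check also finds nothing, we fall through
+    //  to "return buf - 1" below: by convention the reverse search reports
+    //  one byte before the start of the buffer when no matching byte exists,
+    //  as noted in the prototype comment in shufti.h.)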
+ chars = loadu128(buf); + rv = revBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes); + if (rv) { + return rv; + } + + return buf - 1; +} + +static really_inline +const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, + m128 chars, const u8 *buf, const m128 low4bits, + const m128 ones) { + m128 chars_lo = GET_LO_4(chars); + m128 chars_hi = GET_HI_4(chars); + m128 c_lo = pshufb_m128(mask1_lo, chars_lo); + m128 c_hi = pshufb_m128(mask1_hi, chars_hi); + m128 t = or128(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); +#endif + + m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); + m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); + m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); + +#ifdef DEBUG + DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n"); + DEBUG_PRINTF(" c2_hi: "); dumpMsk128(c2_hi); printf("\n"); + DEBUG_PRINTF(" t2: "); dumpMsk128(t2); printf("\n"); +#endif + + u32 z = movemask128(eq128(t2, ones)); + DEBUG_PRINTF(" z: 0x%08x\n", z); + return firstMatch(buf, z); +} + +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { + const m128 ones = ones128(); + const m128 low4bits = _mm_set1_epi8(0xf); + const u8 *rv; + + size_t min = (size_t)buf % 16; + + // Preconditioning: most of the time our buffer won't be aligned. + m128 chars = loadu128(buf); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, + chars, buf, low4bits, ones); + if (rv) { + return rv; + } + buf += (16 - min); + + // Unrolling was here, but it wasn't doing anything but taking up space. + // Reroll FTW. + + const u8 *last_block = buf_end - 16; + while (buf < last_block) { + m128 lchars = load128(buf); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, + lchars, buf, low4bits, ones); + if (rv) { + return rv; + } + buf += 16; + } + + // Use an unaligned load to mop up the last 16 bytes and get an accurate + // picture to buf_end. 
+ chars = loadu128(buf_end - 16); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, + chars, buf_end - 16, low4bits, ones); + if (rv) { + return rv; + } + + return buf_end; +} + +#elif !defined(HAVE_AVX512) +// AVX2 - 256 wide shuftis + +#ifdef DEBUG +DUMP_MSK(256) +#endif + +#define GET_LO_4(chars) and256(chars, low4bits) +#define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4) + +static really_inline +u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits, + const m256 compare) { + m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); + m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); + m256 t = and256(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); +#endif + + return movemask256(eq256(t, compare)); +} + +static really_inline +const u8 *firstMatch(const u8 *buf, u32 z) { + DEBUG_PRINTF("z 0x%08x\n", z); + if (unlikely(z != 0xffffffff)) { + u32 pos = ctz32(~z); + assert(pos < 32); + DEBUG_PRINTF("match @ pos %u\n", pos); + return buf + pos; + } else { + return NULL; // no match + } +} + +static really_inline +const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, + const m256 low4bits) { + // do the hi and lo shuffles in the one avx register + m256 c = combine2x128(rshift64_m128(chars, 4), chars); + c = and256(c, low4bits); + m256 c_shuf = pshufb_m256(mask, c); + m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); + // the upper 32-bits can't match + u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128())); + + return firstMatch(buf, z); +} + +static really_inline +const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const m256 low4bits) { + // run shufti over two overlapping 16-byte unaligned reads + const m256 mask = combine2x128(mask_hi, mask_lo); + m128 chars = loadu128(buf); + const u8 *rv = fwdBlockShort(mask, chars, buf, low4bits); + if (rv) { + return rv; + } + + chars = loadu128(buf_end - 16); + rv = fwdBlockShort(mask, chars, buf_end - 16, low4bits); + if (rv) { + return rv; + } + return buf_end; +} + +static really_inline +const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, + const m256 low4bits, const m256 zeroes) { + u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + return firstMatch(buf, z); +} + +/* takes 128 bit masks, but operates on 256 bits of data */ +const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); + + // Slow path for small cases. + if (buf_end - buf < 16) { + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); + } + + const m256 low4bits = set32x8(0xf); + + if (buf_end - buf <= 32) { + return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits); + } + + const m256 zeroes = zeroes256(); + const m256 wide_mask_lo = set2x128(mask_lo); + const m256 wide_mask_hi = set2x128(mask_hi); + const u8 *rv; + + size_t min = (size_t)buf % 32; + assert(buf_end - buf >= 32); + + // Preconditioning: most of the time our buffer won't be aligned. 
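+    // (The first 32 bytes are checked via an unaligned load; the aligned
+    //  loop below then re-examines the tail of that window, which is
+    //  harmless since those bytes were already found not to match.)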
+ m256 chars = loadu256(buf); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes); + if (rv) { + return rv; + } + buf += (32 - min); + + // Unrolling was here, but it wasn't doing anything but taking up space. + // Reroll FTW. + + const u8 *last_block = buf_end - 32; + while (buf < last_block) { + m256 lchars = load256(buf); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes); + if (rv) { + return rv; + } + buf += 32; + } + + // Use an unaligned load to mop up the last 32 bytes and get an accurate + // picture to buf_end. + assert(buf <= buf_end && buf >= buf_end - 32); + chars = loadu256(buf_end - 32); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); + if (rv) { + return rv; + } + + return buf_end; +} + +static really_inline +const u8 *lastMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffffffff)) { + u32 pos = clz32(~z); + DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); + return buf + (31 - pos); + } else { + return NULL; // no match + } +} + +static really_inline +const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, + const m256 low4bits, const m256 zeroes) { + m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); + m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); + m256 t = and256(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); +#endif + + u32 z = movemask256(eq256(t, zeroes)); + return lastMatch(buf, z); +} + +static really_inline +const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, + const m256 low4bits) { + // do the hi and lo shuffles in the one avx register + m256 c = combine2x128(rshift64_m128(chars, 4), chars); + c = and256(c, low4bits); + m256 c_shuf = pshufb_m256(mask, c); + m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); + // the upper 32-bits can't match + u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128())); + + return lastMatch(buf, z); +} + +static really_inline +const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const m256 low4bits) { + // run shufti over two overlapping 16-byte unaligned reads + const m256 mask = combine2x128(mask_hi, mask_lo); + + m128 chars = loadu128(buf_end - 16); + const u8 *rv = revBlockShort(mask, chars, buf_end - 16, low4bits); + if (rv) { + return rv; + } + + chars = loadu128(buf); + rv = revBlockShort(mask, chars, buf, low4bits); + if (rv) { + return rv; + } + return buf - 1; +} + + +/* takes 128 bit masks, but operates on 256 bits of data */ +const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + + // Slow path for small cases. + if (buf_end - buf < 16) { + return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); + } + + const m256 low4bits = set32x8(0xf); + + if (buf_end - buf <= 32) { + return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits); + } + + const m256 zeroes = zeroes256(); + const m256 wide_mask_lo = set2x128(mask_lo); + const m256 wide_mask_hi = set2x128(mask_hi); + const u8 *rv; + + assert(buf_end - buf >= 32); + + // Preconditioning: most of the time our buffer won't be aligned. 
+ m256 chars = loadu256(buf_end - 32); + rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); + if (rv) { + return rv; + } + buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f)); + + // Unrolling was here, but it wasn't doing anything but taking up space. + // Reroll FTW. + const u8 *last_block = buf + 32; + while (buf_end > last_block) { + buf_end -= 32; + m256 lchars = load256(buf_end); + rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, zeroes); + if (rv) { + return rv; + } + } + + // Use an unaligned load to mop up the last 32 bytes and get an accurate + // picture to buf. + chars = loadu256(buf); + rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes); + if (rv) { + return rv; + } + + return buf - 1; +} + +static really_inline +const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi, + m256 chars, const u8 *buf, const m256 low4bits, + const m256 ones) { + DEBUG_PRINTF("buf %p\n", buf); + m256 chars_lo = GET_LO_4(chars); + m256 chars_hi = GET_HI_4(chars); + m256 c_lo = pshufb_m256(mask1_lo, chars_lo); + m256 c_hi = pshufb_m256(mask1_hi, chars_hi); + m256 t = or256(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); +#endif + + m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); + m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); + m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1)); + +#ifdef DEBUG + DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n"); + DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n"); + DEBUG_PRINTF(" t2: "); dumpMsk256(t2); printf("\n"); +#endif + u32 z = movemask256(eq256(t2, ones)); + + return firstMatch(buf, z); +} + +static really_inline +const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, + const m256 low4bits) { + // do the hi and lo shuffles in the one avx register + m256 c = combine2x128(rshift64_m128(chars, 4), chars); + c = and256(c, low4bits); + m256 c_shuf1 = pshufb_m256(mask1, c); + m256 c_shuf2 = rshift128_m256(pshufb_m256(mask2, c), 1); + m256 t0 = or256(c_shuf1, c_shuf2); + m128 t = or128(movdq_hi(t0), cast256to128(t0)); + // the upper 32-bits can't match + u32 z = 0xffff0000U | movemask128(eq128(t, ones128())); + + return firstMatch(buf, z); +} + +static really_inline +const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, + m128 mask2_hi, const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); + const m256 low4bits = set32x8(0xf); + // run shufti over two overlapping 16-byte unaligned reads + const m256 mask1 = combine2x128(mask1_hi, mask1_lo); + const m256 mask2 = combine2x128(mask2_hi, mask2_lo); + m128 chars = loadu128(buf); + const u8 *rv = fwdBlockShort2(mask1, mask2, chars, buf, low4bits); + if (rv) { + return rv; + } + + chars = loadu128(buf_end - 16); + rv = fwdBlockShort2(mask1, mask2, chars, buf_end - 16, low4bits); + if (rv) { + return rv; + } + return buf_end; +} + +/* takes 128 bit masks, but operates on 256 bits of data */ +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { + /* we should always have at least 16 bytes */ + assert(buf_end - buf >= 16); + DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); + 
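+    // (Double shufti searches for a two-byte sequence: mask1_lo/mask1_hi
+    //  describe the character class of the first byte and mask2_lo/mask2_hi
+    //  that of the second byte; fwdBlock2 shifts the second-byte result by
+    //  one position so both tests line up on the offset of the first byte.)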
+ if (buf_end - buf < 32) { + return shuftiDoubleShort(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf, + buf_end); + } + + const m256 ones = ones256(); + const m256 low4bits = set32x8(0xf); + const m256 wide_mask1_lo = set2x128(mask1_lo); + const m256 wide_mask1_hi = set2x128(mask1_hi); + const m256 wide_mask2_lo = set2x128(mask2_lo); + const m256 wide_mask2_hi = set2x128(mask2_hi); + const u8 *rv; + + size_t min = (size_t)buf % 32; + + // Preconditioning: most of the time our buffer won't be aligned. + m256 chars = loadu256(buf); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, + chars, buf, low4bits, ones); + if (rv) { + return rv; + } + buf += (32 - min); + + // Unrolling was here, but it wasn't doing anything but taking up space. + // Reroll FTW. + const u8 *last_block = buf_end - 32; + while (buf < last_block) { + m256 lchars = load256(buf); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, + lchars, buf, low4bits, ones); + if (rv) { + return rv; + } + buf += 32; + } + + // Use an unaligned load to mop up the last 32 bytes and get an accurate + // picture to buf_end. + chars = loadu256(buf_end - 32); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, + chars, buf_end - 32, low4bits, ones); + if (rv) { + return rv; + } + + return buf_end; +} + +#else // defined(HAVE_AVX512) + +#ifdef DEBUG +DUMP_MSK(512) +#endif + +static really_inline +u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits, + const m512 compare) { + m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); + m512 c_hi = pshufb_m512(mask_hi, + rshift64_m512(andnot512(low4bits, chars), 4)); + m512 t = and512(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); +#endif + + return eq512mask(t, compare); +} +static really_inline +const u8 *firstMatch64(const u8 *buf, u64a z) { + DEBUG_PRINTF("z 0x%016llx\n", z); + if (unlikely(z != ~0ULL)) { + u32 pos = ctz64(~z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 64); + return buf + pos; + } else { + return NULL; // no match + } +} + +static really_inline +const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, + const m512 low4bits, const m512 zeroes) { + u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + return firstMatch64(buf, z); +} + +static really_inline +const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, + const u8 *buf_end, const m512 low4bits, + const m512 zeroes) { + DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf); + uintptr_t len = buf_end - buf; + assert(len <= 64); + + // load mask + u64a k = (~0ULL) >> (64 - len); + DEBUG_PRINTF("load mask 0x%016llx\n", k); + + m512 chars = loadu_maskz_m512(k, buf); + + u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + // reuse the load mask to indicate valid bytes + return firstMatch64(buf, z | ~k); +} + +/* takes 128 bit masks, but operates on 512 bits of data */ +const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const m512 low4bits = set64x8(0xf); + const m512 zeroes = zeroes512(); + const m512 
wide_mask_lo = set4x128(mask_lo); + const m512 wide_mask_hi = set4x128(mask_hi); + const u8 *rv; + + // small cases. + if (buf_end - buf <= 64) { + rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits, + zeroes); + return rv ? rv : buf_end; + } + + assert(buf_end - buf >= 64); + + // Preconditioning: most of the time our buffer won't be aligned. + if ((uintptr_t)buf % 64) { + rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, + ROUNDUP_PTR(buf, 64), low4bits, zeroes); + if (rv) { + return rv; + } + buf = ROUNDUP_PTR(buf, 64); + } + + const u8 *last_block = ROUNDDOWN_PTR(buf_end, 64); + while (buf < last_block) { + m512 lchars = load512(buf); + rv = fwdBlock512(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, + zeroes); + if (rv) { + return rv; + } + buf += 64; + } + + if (buf == buf_end) { + goto done; + } + + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf_end. + assert(buf <= buf_end && buf >= buf_end - 64); + m512 chars = loadu512(buf_end - 64); + rv = fwdBlock512(wide_mask_lo, wide_mask_hi, chars, buf_end - 64, low4bits, + zeroes); + if (rv) { + return rv; + } +done: + return buf_end; +} + +static really_inline +const u8 *lastMatch64(const u8 *buf, u64a z) { + DEBUG_PRINTF("z 0x%016llx\n", z); + if (unlikely(z != ~0ULL)) { + u32 pos = clz64(~z); + DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); + return buf + (63 - pos); + } else { + return NULL; // no match + } +} + +static really_inline +const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, + const u8 *buf_end, const m512 low4bits, + const m512 zeroes) { + DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); + uintptr_t len = buf_end - buf; + assert(len <= 64); + + // load mask + u64a k = (~0ULL) >> (64 - len); + DEBUG_PRINTF("load mask 0x%016llx\n", k); + + m512 chars = loadu_maskz_m512(k, buf); + + u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + // reuse the load mask to indicate valid bytes + return lastMatch64(buf, z | ~k); +} + +static really_inline +const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, + const m512 low4bits, const m512 zeroes) { + m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); + m512 c_hi = pshufb_m512(mask_hi, + rshift64_m512(andnot512(low4bits, chars), 4)); + m512 t = and512(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); +#endif + + u64a z = eq512mask(t, zeroes); + return lastMatch64(buf, z); +} + +/* takes 128 bit masks, but operates on 512 bits of data */ +const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("buf %p buf_end %p\n", buf, buf_end); + assert(buf && buf_end); + assert(buf < buf_end); + + const m512 low4bits = set64x8(0xf); + const m512 zeroes = zeroes512(); + const m512 wide_mask_lo = set4x128(mask_lo); + const m512 wide_mask_hi = set4x128(mask_hi); + const u8 *rv; + + if (buf_end - buf < 64) { + rv = rshortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits, + zeroes); + return rv ? 
rv : buf - 1; + } + + if (ROUNDDOWN_PTR(buf_end, 64) != buf_end) { + // peel off unaligned portion + assert(buf_end - buf >= 64); + DEBUG_PRINTF("start\n"); + rv = rshortShufti512(wide_mask_lo, wide_mask_hi, + ROUNDDOWN_PTR(buf_end, 64), buf_end, low4bits, + zeroes); + if (rv) { + return rv; + } + buf_end = ROUNDDOWN_PTR(buf_end, 64); + } + + const u8 *last_block = ROUNDUP_PTR(buf, 64); + while (buf_end > last_block) { + buf_end -= 64; + m512 lchars = load512(buf_end); + rv = revBlock512(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, + zeroes); + if (rv) { + return rv; + } + } + if (buf_end == buf) { + goto done; + } + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf. + m512 chars = loadu512(buf); + rv = revBlock512(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes); + if (rv) { + return rv; + } +done: + return buf - 1; +} + +static really_inline +const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi, + m512 chars, const u8 *buf, const m512 low4bits, + const m512 ones, __mmask64 k) { + DEBUG_PRINTF("buf %p %.64s\n", buf, buf); + m512 chars_lo = and512(chars, low4bits); + m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4); + m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); + m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); + m512 t = or512(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); +#endif + + m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); + m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); + m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1)); + +#ifdef DEBUG + DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n"); + DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n"); + DEBUG_PRINTF(" t2: "); dumpMsk512(t2); printf("\n"); +#endif + u64a z = eq512mask(t2, ones); + + return firstMatch64(buf, z | ~k); +} + +static really_inline +const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, + m512 mask2_hi, const u8 *buf, const u8 *buf_end, + const m512 low4bits, const m512 ones) { + DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); + uintptr_t len = buf_end - buf; + assert(len <= 64); + + u64a k = (~0ULL) >> (64 - len); + DEBUG_PRINTF("load mask 0x%016llx\n", k); + + m512 chars = loadu_mask_m512(ones, k, buf); + + const u8 *rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf, + low4bits, ones, k); + + return rv; +} + +/* takes 128 bit masks, but operates on 512 bits of data */ +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { + /* we should always have at least 16 bytes */ + assert(buf_end - buf >= 16); + DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); + + const m512 ones = ones512(); + const m512 low4bits = set64x8(0xf); + const m512 wide_mask1_lo = set4x128(mask1_lo); + const m512 wide_mask1_hi = set4x128(mask1_hi); + const m512 wide_mask2_lo = set4x128(mask2_lo); + const m512 wide_mask2_hi = set4x128(mask2_hi); + const u8 *rv; + + if (buf_end - buf <= 64) { + rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, + wide_mask2_hi, buf, buf_end, low4bits, ones); + DEBUG_PRINTF("rv %p\n", rv); + return rv ? 
rv : buf_end; + } + + // Preconditioning: most of the time our buffer won't be aligned. + if ((uintptr_t)buf % 64) { + rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, + wide_mask2_hi, buf, ROUNDUP_PTR(buf, 64), + low4bits, ones); + if (rv) { + return rv; + } + + buf = ROUNDUP_PTR(buf, 64); + } + + const u8 *last_block = buf_end - 64; + while (buf < last_block) { + m512 lchars = load512(buf); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, + wide_mask2_hi, lchars, buf, low4bits, ones, ~0); + if (rv) { + return rv; + } + buf += 64; + } + + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf_end. + m512 chars = loadu512(buf_end - 64); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, + chars, buf_end - 64, low4bits, ones, ~0); + if (rv) { + return rv; + } + + return buf_end; +} +#endif diff --git a/regex/nfa/shufti.h b/regex/nfa/shufti.h new file mode 100644 index 000000000..1ebf776cc --- /dev/null +++ b/regex/nfa/shufti.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Shufti: character class acceleration. + * + * Utilises the SSSE3 pshufb shuffle instruction + */ + +#ifndef SHUFTI_H +#define SHUFTI_H + +#include "ue2common.h" +#include "util/simd_utils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end); + +// Returns (buf - 1) if not found. 
+const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end); + +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/nfa/tamarama.c b/regex/nfa/tamarama.c new file mode 100644 index 000000000..43480f065 --- /dev/null +++ b/regex/nfa/tamarama.c @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + \brief Tamarama: container engine for exclusive engines, runtime code. 
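+
+    A Tamarama wraps several mutually exclusive subengines behind a single
+    NFA interface: at most one subengine is active at a time, selected by
+    the top event it owns (see findEngineForTop()), and its index is kept
+    in the leading activeIdxSize bytes of the stream state.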
+*/ +#include "config.h" + +#include "tamarama.h" + +#include "tamarama_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_api_util.h" +#include "nfa_internal.h" +#include "scratch.h" +#include "util/partial_store.h" + +static really_inline +u32 getSubOffset(const struct Tamarama *t, u32 num) { + DEBUG_PRINTF("subengine:%u\n", num); + assert(num < t->numSubEngines); + const u32 *sub = + (const u32 *)((const char *)t + sizeof(struct Tamarama) + + t->numSubEngines * sizeof(u32)); + assert(ISALIGNED(sub)); + return sub[num]; +} + +static +const struct NFA *getSubEngine(const struct Tamarama *t, + const u32 activeIdx) { + const u32 offset = getSubOffset(t, activeIdx); + DEBUG_PRINTF("activeIdx:%u offsets:%u\n", activeIdx, offset); + const char *base = (const char *)t; + return (const struct NFA *)(base + offset); +} + +static +void storeActiveIdx(const struct Tamarama *t, char *state, + const u32 idx) { + assert(idx <= t->numSubEngines); + partial_store_u32(state, idx, t->activeIdxSize); +} + +static +u32 loadActiveIdx(const char *state, + const u32 activeIdxSize) { + return partial_load_u32(state, activeIdxSize); +} + +static really_inline +void copyQueueProperties(const struct mq *q1, struct mq *q2, + const u32 activeIdxSize) { + q2->state = q1->state; + q2->streamState = q1->streamState + activeIdxSize; + q2->offset = q1->offset; + q2->buffer = q1->buffer; + q2->length = q1->length; + q2->history = q1->history; + q2->hlength = q1->hlength; + q2->cb = q1->cb; + q2->context = q1->context; + q2->scratch = q1->scratch; + q2->report_current = q1->report_current; +} + +static +void copyQueueItems(const struct Tamarama *t, const struct NFA *sub, + struct mq *q1, struct mq *q2, const u32 activeIdx) { + const u32 *baseTop = (const u32 *)((const char *)t + + sizeof(struct Tamarama)); + + u32 lower = baseTop[activeIdx]; + u32 upper = activeIdx == t->numSubEngines - 1 ? + ~0U : baseTop[activeIdx + 1]; + u32 event_base = isMultiTopType(sub->type) ? 
MQE_TOP_FIRST : MQE_TOP; + while (q1->cur < q1->end) { + u32 type = q1->items[q1->cur].type; + s64a loc = q1->items[q1->cur].location; + DEBUG_PRINTF("type:%u lower:%u upper:%u\n", type, lower, upper); + if (type >= lower && type < upper) { + u32 event = event_base; + if (event == MQE_TOP_FIRST) { + event += type - lower; + } + pushQueue(q2, event, loc); + } else { + pushQueueNoMerge(q2, MQE_END, loc); + break; + } + q1->cur++; + } +} + +static +void copyQueue(const struct Tamarama *t, const struct NFA *sub, + struct mq *q1, struct mq *q2, const u32 activeIdx) { + copyQueueProperties(q1, q2, t->activeIdxSize); + + // copy MQE_START item + u32 cur = q1->cur++; + q2->cur = cur; + q2->items[cur] = q1->items[cur]; + q2->end = cur + 1; + + copyQueueItems(t, sub, q1, q2, activeIdx); + // restore cur index of the main queue + q1->cur = cur; +} + +static +u32 findEngineForTop(const u32 *baseTop, const u32 cur, + const u32 numSubEngines) { + u32 i; + for (i = 0; i < numSubEngines; ++i) { + DEBUG_PRINTF("cur:%u base:%u\n", cur, baseTop[i]); + if (cur >= baseTop[i] && + (i == numSubEngines - 1 || cur < baseTop[i + 1])) { + break; + } + } + return i; +} + +static +void initSubQueue(const struct Tamarama *t, struct mq *q1, + struct mq *q2, const u32 lastActiveIdx, + const u32 activeIdx) { + // Push events to the new queue + const struct NFA *sub = getSubEngine(t, activeIdx); + assert(!isContainerType(sub->type)); + q2->nfa = sub; + + // Reinitialize state if the last active subengine is different + // from current one + if (lastActiveIdx == t->numSubEngines || + lastActiveIdx != activeIdx) { + nfaQueueInitState(q2->nfa, q2); + } + + copyQueueItems(t, sub, q1, q2, activeIdx); + if (q1->items[q1->cur].type == MQE_END) { + q1->cur++; + } + DEBUG_PRINTF("update lastIdx:%u\n", activeIdx); + storeActiveIdx(t, q1->streamState, activeIdx); +} + +static +void updateQueues(const struct Tamarama *t, struct mq *q1, struct mq *q2) { + q2->cur = q2->end = 0; + copyQueueProperties(q1, q2, t->activeIdxSize); + + const u32 numSubEngines = t->numSubEngines; + u32 lastActiveIdx = loadActiveIdx(q1->streamState, + t->activeIdxSize); +#ifdef DEBUG + DEBUG_PRINTF("external queue\n"); + debugQueue(q1); +#endif + + // Push MQE_START event to the subqueue + s64a loc = q1->items[q1->cur].location; + pushQueueAt(q2, 0, MQE_START, loc); + char hasStart = 0; + if (q1->items[q1->cur].type == MQE_START) { + hasStart = 1; + q1->cur++; + } + + u32 activeIdx = lastActiveIdx; + // If we have top events in the main queue, update current active id + if (q1->cur < q1->end - 1) { + const u32 *baseTop = (const u32 *)((const char *)t + + sizeof(struct Tamarama)); + u32 curTop = q1->items[q1->cur].type; + activeIdx = findEngineForTop(baseTop, curTop, numSubEngines); + } + + assert(activeIdx < numSubEngines); + DEBUG_PRINTF("last id:%u, current id:%u, num of subengines:%u\n", + lastActiveIdx, activeIdx, numSubEngines); + // Handle unfinished last alive subengine + if (lastActiveIdx != activeIdx && + lastActiveIdx != numSubEngines && hasStart) { + loc = q1->items[q1->cur].location; + pushQueueNoMerge(q2, MQE_END, loc); + q2->nfa = getSubEngine(t, lastActiveIdx); + return; + } + + initSubQueue(t, q1, q2, lastActiveIdx, activeIdx); + DEBUG_PRINTF("finish queues\n"); +} + +// After processing subqueue items for subengines, we need to copy back +// remaining items in subqueue if there are any to Tamarama main queue +static +void copyBack(const struct Tamarama *t, struct mq *q, struct mq *q1) { + DEBUG_PRINTF("copy back %u, %u\n", q1->cur, q1->end); + 
q->report_current = q1->report_current; + if (q->cur >= q->end && q1->cur >= q1->end) { + return; + } + + const u32 *baseTop = (const u32 *)((const char *)t + + sizeof(struct Tamarama)); + const u32 lastIdx = loadActiveIdx(q->streamState, + t->activeIdxSize); + u32 base = 0, event_base = 0; + if (lastIdx != t->numSubEngines) { + base = baseTop[lastIdx]; + const struct NFA *sub = getSubEngine(t, lastIdx); + event_base = isMultiTopType(sub->type) ? MQE_TOP_FIRST : MQE_TOP; + } + + u32 numItems = q1->end > q1->cur + 1 ? q1->end - q1->cur - 1 : 1; + // Also need to copy MQE_END if the main queue is empty + if (q->cur == q->end) { + assert(q->cur > 1 && q1->items[q1->end - 1].type == MQE_END); + q->items[--q->cur] = q1->items[q1->end - 1]; + } + u32 cur = q->cur - numItems; + q->items[cur] = q1->items[q1->cur++]; + q->items[cur].type = MQE_START; + q->cur = cur++; + for (u32 i = 0; i < numItems - 1; ++i) { + assert(q1->cur < q1->end); + u32 type = q1->items[q1->cur].type; + if (type > MQE_END) { + q1->items[q1->cur].type = type - event_base + base; + } + q->items[cur++] = q1->items[q1->cur++]; + } + +#ifdef DEBUG + DEBUG_PRINTF("external queue\n"); + debugQueue(q); +#endif +} + +char nfaExecTamarama_testEOD(const struct NFA *n, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return MO_CONTINUE_MATCHING; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + if (nfaAcceptsEod(sub)) { + assert(!isContainerType(sub->type)); + const char *subStreamState = streamState + t->activeIdxSize; + return nfaCheckFinalState(sub, state, subStreamState, offset, callback, + context); + } + + return MO_CONTINUE_MATCHING; +} + +char nfaExecTamarama_QR(const struct NFA *n, struct mq *q, ReportID report) { + DEBUG_PRINTF("exec rose\n"); + struct mq q1; + q1.cur = q1.end = 0; + char rv = 0; + const struct Tamarama *t = getImplNfa(n); + while (q->cur < q->end) { + updateQueues(t, q, &q1); + } + + if (q1.cur < q1.end) { + rv = nfaQueueExecRose(q1.nfa, &q1, report); + } + + DEBUG_PRINTF("exec rose rv:%u\n", rv); + return rv; +} + +char nfaExecTamarama_reportCurrent(const struct NFA *n, struct mq *q) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 1; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaReportCurrentMatches(sub, &q1); +} + +char nfaExecTamarama_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaInAcceptState(sub, report, &q1); +} + +char nfaExecTamarama_inAnyAccept(const struct NFA *n, struct mq *q) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaInAnyAcceptState(sub, &q1); +} + +char nfaExecTamarama_queueInitState(const struct NFA *n, struct mq *q) { + DEBUG_PRINTF("init 
state\n"); + const struct Tamarama *t = getImplNfa(n); + char *ptr = q->streamState; + // Use activeIdxSize as a sentinel value and initialize the state to + // an invalid engine as nothing has been triggered yet + storeActiveIdx(t, ptr, t->numSubEngines); + return 0; +} + +char nfaExecTamarama_queueCompressState(const struct NFA *n, const struct mq *q, + s64a loc) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueueProperties(q, &q1, t->activeIdxSize); + return nfaQueueCompressState(sub, &q1, loc); +} + +char nfaExecTamarama_expandState(const struct NFA *n, void *dest, + const void *src, u64a offset, u8 key) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(src, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + + const char *subStreamState = (const char *)src + t->activeIdxSize; + return nfaExpandState(sub, dest, subStreamState, offset, key); +} + +enum nfa_zombie_status nfaExecTamarama_zombie_status(const struct NFA *n, + struct mq *q, s64a loc) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return NFA_ZOMBIE_NO; + } + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaGetZombieStatus(sub, &q1, loc); +} + +char nfaExecTamarama_Q(const struct NFA *n, struct mq *q, s64a end) { + DEBUG_PRINTF("exec\n"); + struct mq q1; + char rv = MO_ALIVE; + char copy = 0; + const struct Tamarama *t = getImplNfa(n); + while (q->cur < q->end && q_cur_loc(q) <= end) { + updateQueues(t, q, &q1); + rv = nfaQueueExec_raw(q1.nfa, &q1, end); + q->report_current = q1.report_current; + copy = 1; + if (can_stop_matching(q->scratch)) { + break; + } + } + if (copy) { + copyBack(t, q, &q1); + } + return rv; +} + +char nfaExecTamarama_Q2(const struct NFA *n, struct mq *q, s64a end) { + DEBUG_PRINTF("exec to match\n"); + struct mq q1; + char rv = 0; + char copy = 0; + const struct Tamarama *t = getImplNfa(n); + while (q->cur < q->end && q_cur_loc(q) <= end && + rv != MO_MATCHES_PENDING) { + updateQueues(t, q, &q1); + rv = nfaQueueExec2_raw(q1.nfa, &q1, end); + q->report_current = q1.report_current; + copy = 1; + if (can_stop_matching(q->scratch)) { + break; + } + } + if (copy) { + copyBack(t, q, &q1); + } + return rv; +} + diff --git a/regex/nfa/tamarama.h b/regex/nfa/tamarama.h new file mode 100644 index 000000000..3b52d8de7 --- /dev/null +++ b/regex/nfa/tamarama.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TAMARAMA_H +#define TAMARAMA_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; +struct hs_scratch; + +char nfaExecTamarama_testEOD(const struct NFA *n, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecTamarama_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecTamarama_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecTamarama_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecTamarama_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecTamarama_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecTamarama_queueCompressState(const struct NFA *n, const struct mq *q, + s64a loc); +char nfaExecTamarama_expandState(const struct NFA *n, void *dest, + const void *src, u64a offset, u8 key); +enum nfa_zombie_status nfaExecTamarama_zombie_status(const struct NFA *n, + struct mq *q, s64a loc); +char nfaExecTamarama_Q(const struct NFA *nfa, struct mq *q, s64a end); +char nfaExecTamarama_Q2(const struct NFA *nfa, struct mq *q, s64a end); + +// only used by outfix and miracles, no implementation for tamarama +#define nfaExecTamarama_initCompressedState NFA_API_NO_IMPL +#define nfaExecTamarama_B_Reverse NFA_API_NO_IMPL + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/nfa/tamarama_internal.h b/regex/nfa/tamarama_internal.h new file mode 100644 index 000000000..5cdc70d40 --- /dev/null +++ b/regex/nfa/tamarama_internal.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + *\brief Tamarama: container engine for exclusive engines, + * data structures. + */ + +/* Tamarama bytecode layout: + * * |-----| + * * | | struct NFA + * * |-----| + * * | | struct Tamarama + * * | | + * * |-----| + * * | | top remapping table: + * * | | stores top base for each subengine. + * * | | old_top = remapped_top - top_base; + * * | | The size of table is equal to the number of subengines. + * * ... + * * | | + * * |-----| + * * | | offsets from the start of struct Tamarama to subengines --\ + * * ... | + * * | | -----------\ | + * * |-----| | | + * * ||--| | subengine 1 (struct NFA + rest of subengine) <--/ | + * * || | | | + * * ||--| | | + * * || | | | + * * || | | | + * * ||--| | | + * * | | | + * * ||--| | subengine 2 (struct NFA + rest of subengine) <-------/ + * * || | | + * * ||--| | + * * || | | + * * || | | + * * ||--| | + * * | | + * * ... + * * | | + * * |-----| total size of tamarama + * * + * * Tamarama stream state: + * * + * * |---| + * * | | active subengine id + * * |---| + * * | | common pool of stream state for each engine + * * | | + * * | | + * * ... + * * | | + * * | | + * * |---| + * * + * * Tamarama scratch space: + * * + * * |---| + * * | | common pool of scratch for each engine + * * | | + * * | | + * * ... + * * | | + * * | | + * * |---| + * */ + +#ifndef NFA_TAMARAMA_INTERNAL_H +#define NFA_TAMARAMA_INTERNAL_H + +#include "ue2common.h" + +struct ALIGN_AVX_DIRECTIVE Tamarama { + u32 numSubEngines; + u8 activeIdxSize; +}; + +#endif // NFA_TAMARAMA_INTERNAL_H diff --git a/regex/nfa/truffle.c b/regex/nfa/truffle.c new file mode 100644 index 000000000..be6b312cf --- /dev/null +++ b/regex/nfa/truffle.c @@ -0,0 +1,608 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Matches a byte in a charclass using three shuffles + */ + + +#include "ue2common.h" +#include "truffle.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +#if !defined(HAVE_AVX2) + +static really_inline +const u8 *lastMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffff)) { + u32 pos = clz32(~z & 0xffff); + assert(pos >= 16 && pos < 32); + return buf + (31 - pos); + } + + return NULL; // no match +} + +static really_inline +const u8 *firstMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffff)) { + u32 pos = ctz32(~z & 0xffff); + assert(pos < 16); + return buf + pos; + } + + return NULL; // no match +} + +static really_inline +u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { + + m128 highconst = _mm_set1_epi8(0x80); + m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201); + + // and now do the real work + m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v); + m128 t1 = xor128(v, highconst); + m128 shuf2 = pshufb_m128(shuf_mask_lo_highset, t1); + m128 t2 = andnot128(highconst, rshift64_m128(v, 4)); + m128 shuf3 = pshufb_m128(shuf_mask_hi, t2); + m128 tmp = and128(or128(shuf1, shuf2), shuf3); + m128 tmp2 = eq128(tmp, zeroes128()); + u32 z = movemask128(tmp2); + + return z; +} + +static +const u8 *truffleMini(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 16); + + m128 chars = zeroes128(); + memcpy(&chars, buf, len); + + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + // can't be these bytes in z + u32 mask = (0xffff >> (16 - len)) ^ 0xffff; + const u8 *rv = firstMatch(buf, z | mask); + + if (rv) { + return rv; + } else { + return buf_end; + } +} + +static really_inline +const u8 *fwdBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + m128 v, const u8 *buf) { + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return firstMatch(buf, z); +} + +static really_inline +const u8 *revBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + m128 v, const u8 *buf) { + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return lastMatch(buf, z); +} + +const u8 *truffleExec(m128 shuf_mask_lo_highclear, + m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("len %zu\n", buf_end - buf); + + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + if (buf_end - buf < 16) { + return truffleMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf, + buf_end); + } + + size_t min = (size_t)buf % 16; + assert(buf_end - buf >= 16); + + // Preconditioning: most of the time our buffer won't be aligned. 
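+    // One unaligned 16-byte block is scanned at buf, then buf is stepped up
+    // to the next 16-byte boundary; any bytes covered twice are simply
+    // rescanned by the aligned loop below.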
+ m128 chars = loadu128(buf); + rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf); + if (rv) { + return rv; + } + buf += (16 - min); + + const u8 *last_block = buf_end - 16; + while (buf < last_block) { + m128 lchars = load128(buf); + rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, lchars, + buf); + if (rv) { + return rv; + } + buf += 16; + } + + // Use an unaligned load to mop up the last 16 bytes and get an accurate + // picture to buf_end. + assert(buf <= buf_end && buf >= buf_end - 16); + chars = loadu128(buf_end - 16); + rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, + buf_end - 16); + if (rv) { + return rv; + } + + return buf_end; +} + +static +const u8 *truffleRevMini(m128 shuf_mask_lo_highclear, + m128 shuf_mask_lo_highset, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 16); + + m128 chars = zeroes128(); + memcpy(&chars, buf, len); + + u32 mask = (0xffff >> (16 - len)) ^ 0xffff; + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + const u8 *rv = lastMatch(buf, z | mask); + + if (rv) { + return rv; + } + return buf - 1; +} + +const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, + m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + DEBUG_PRINTF("len %zu\n", buf_end - buf); + + if (buf_end - buf < 16) { + return truffleRevMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf, + buf_end); + } + + assert(buf_end - buf >= 16); + + // Preconditioning: most of the time our buffer won't be aligned. + m128 chars = loadu128(buf_end - 16); + rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, + buf_end - 16); + if (rv) { + return rv; + } + buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf)); + + const u8 *last_block = buf + 16; + while (buf_end > last_block) { + buf_end -= 16; + m128 lchars = load128(buf_end); + rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, lchars, + buf_end); + if (rv) { + return rv; + } + } + + // Use an unaligned load to mop up the last 16 bytes and get an accurate + // picture to buf_end. 
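+    // (in this reverse scan the mop-up block sits at buf; it may overlap the
+    // last aligned iteration, which is harmless)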
+ chars = loadu128(buf); + rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf); + if (rv) { + return rv; + } + + return buf - 1; +} + +#elif !defined(HAVE_AVX512) + +// AVX2 + +static really_inline +const u8 *lastMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffffffff)) { + u32 pos = clz32(~z); + assert(pos < 32); + return buf + (31 - pos); + } + + return NULL; // no match +} + +static really_inline +const u8 *firstMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffffffff)) { + u32 pos = ctz32(~z); + assert(pos < 32); + return buf + pos; + } + + return NULL; // no match +} + +static really_inline +u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) { + + m256 highconst = _mm256_set1_epi8(0x80); + m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201); + + // and now do the real work + m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v); + m256 t1 = xor256(v, highconst); + m256 shuf2 = pshufb_m256(shuf_mask_lo_highset, t1); + m256 t2 = andnot256(highconst, rshift64_m256(v, 4)); + m256 shuf3 = pshufb_m256(shuf_mask_hi, t2); + m256 tmp = and256(or256(shuf1, shuf2), shuf3); + m256 tmp2 = eq256(tmp, zeroes256()); + u32 z = movemask256(tmp2); + + return z; +} + +static +const u8 *truffleMini(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 32); + + m256 chars = zeroes256(); + memcpy(&chars, buf, len); + + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + // can't be these bytes in z + u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff; + const u8 *rv = firstMatch(buf, z | mask); + + if (rv) { + return rv; + } else { + return buf_end; + } +} + +static really_inline +const u8 *fwdBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, + m256 v, const u8 *buf) { + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return firstMatch(buf, z); +} + +static really_inline +const u8 *revBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, + m256 v, const u8 *buf) { + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return lastMatch(buf, z); +} + +const u8 *truffleExec(m128 shuf_mask_lo_highclear, + m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("len %zu\n", buf_end - buf); + const m256 wide_clear = set2x128(shuf_mask_lo_highclear); + const m256 wide_set = set2x128(shuf_mask_lo_highset); + + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + if (buf_end - buf < 32) { + return truffleMini(wide_clear, wide_set, buf, buf_end); + } + + size_t min = (size_t)buf % 32; + assert(buf_end - buf >= 32); + + // Preconditioning: most of the time our buffer won't be aligned. + m256 chars = loadu256(buf); + rv = fwdBlock(wide_clear, wide_set, chars, buf); + if (rv) { + return rv; + } + buf += (32 - min); + + const u8 *last_block = buf_end - 32; + while (buf < last_block) { + m256 lchars = load256(buf); + rv = fwdBlock(wide_clear, wide_set, lchars, buf); + if (rv) { + return rv; + } + buf += 32; + } + + // Use an unaligned load to mop up the last 32 bytes and get an accurate + // picture to buf_end. 
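+    // The tail block can overlap the aligned loop by up to 31 bytes; those
+    // bytes were already scanned without a match, so rescanning them cannot
+    // change the result.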
+ assert(buf <= buf_end && buf >= buf_end - 32); + chars = loadu256(buf_end - 32); + rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 32); + if (rv) { + return rv; + } + return buf_end; +} + +static +const u8 *truffleRevMini(m256 shuf_mask_lo_highclear, + m256 shuf_mask_lo_highset, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 32); + + m256 chars = zeroes256(); + memcpy(&chars, buf, len); + + u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff; + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + const u8 *rv = lastMatch(buf, z | mask); + + if (rv) { + return rv; + } + return buf - 1; +} + + +const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, + m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + const m256 wide_clear = set2x128(shuf_mask_lo_highclear); + const m256 wide_set = set2x128(shuf_mask_lo_highset); + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + DEBUG_PRINTF("len %zu\n", buf_end - buf); + + if (buf_end - buf < 32) { + return truffleRevMini(wide_clear, wide_set, buf, buf_end); + } + + assert(buf_end - buf >= 32); + + // Preconditioning: most of the time our buffer won't be aligned. + m256 chars = loadu256(buf_end - 32); + rv = revBlock(wide_clear, wide_set, chars, + buf_end - 32); + if (rv) { + return rv; + } + buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f)); + + const u8 *last_block = buf + 32; + while (buf_end > last_block) { + buf_end -= 32; + m256 lchars = load256(buf_end); + rv = revBlock(wide_clear, wide_set, lchars, buf_end); + if (rv) { + return rv; + } + } + + // Use an unaligned load to mop up the last 32 bytes and get an accurate + // picture to buf_end. + chars = loadu256(buf); + rv = revBlock(wide_clear, wide_set, chars, buf); + if (rv) { + return rv; + } + return buf - 1; +} + +#else // AVX512 + +static really_inline +const u8 *lastMatch(const u8 *buf, u64a z) { + if (unlikely(z != ~0ULL)) { + u64a pos = clz64(~z); + assert(pos < 64); + return buf + (63 - pos); + } + + return NULL; // no match +} + +static really_inline +const u8 *firstMatch(const u8 *buf, u64a z) { + if (unlikely(z != ~0ULL)) { + u64a pos = ctz64(~z); + assert(pos < 64); + DEBUG_PRINTF("pos %llu\n", pos); + return buf + pos; + } + + return NULL; // no match +} + +static really_inline +u64a block(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, m512 v) { + m512 highconst = set64x8(0x80); + m512 shuf_mask_hi = set8x64(0x8040201008040201); + + // and now do the real work + m512 shuf1 = pshufb_m512(shuf_mask_lo_highclear, v); + m512 t1 = xor512(v, highconst); + m512 shuf2 = pshufb_m512(shuf_mask_lo_highset, t1); + m512 t2 = andnot512(highconst, rshift64_m512(v, 4)); + m512 shuf3 = pshufb_m512(shuf_mask_hi, t2); + m512 tmp = and512(or512(shuf1, shuf2), shuf3); + u64a z = eq512mask(tmp, zeroes512()); + + return z; +} + +static really_inline +const u8 *truffleMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len <= 64); + + __mmask64 mask = (~0ULL) >> (64 - len); + + m512 chars = loadu_maskz_m512(mask, buf); + + u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + + const u8 *rv = firstMatch(buf, z | ~mask); + + return rv; +} + +static really_inline +const u8 *fwdBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, + m512 v, const u8 *buf) { + u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return firstMatch(buf, z); +} + +static really_inline +const u8 
*revBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, + m512 v, const u8 *buf) { + u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return lastMatch(buf, z); +} + +const u8 *truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("len %zu\n", buf_end - buf); + const m512 wide_clear = set4x128(shuf_mask_lo_highclear); + const m512 wide_set = set4x128(shuf_mask_lo_highset); + + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + if (buf_end - buf <= 64) { + rv = truffleMini(wide_clear, wide_set, buf, buf_end); + return rv ? rv : buf_end; + } + + assert(buf_end - buf >= 64); + if ((uintptr_t)buf % 64) { + // Preconditioning: most of the time our buffer won't be aligned. + rv = truffleMini(wide_clear, wide_set, buf, ROUNDUP_PTR(buf, 64)); + if (rv) { + return rv; + } + buf = ROUNDUP_PTR(buf, 64); + } + const u8 *last_block = buf_end - 64; + while (buf < last_block) { + m512 lchars = load512(buf); + rv = fwdBlock(wide_clear, wide_set, lchars, buf); + if (rv) { + return rv; + } + buf += 64; + } + + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf_end. + assert(buf <= buf_end && buf >= buf_end - 64); + m512 chars = loadu512(buf_end - 64); + rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 64); + if (rv) { + return rv; + } + return buf_end; +} + +static really_inline +const u8 *truffleRevMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 64); + + __mmask64 mask = (~0ULL) >> (64 - len); + m512 chars = loadu_maskz_m512(mask, buf); + u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + DEBUG_PRINTF("mask 0x%016llx z 0x%016llx\n", mask, z); + const u8 *rv = lastMatch(buf, z | ~mask); + + if (rv) { + return rv; + } + return buf - 1; +} + +const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + const m512 wide_clear = set4x128(shuf_mask_lo_highclear); + const m512 wide_set = set4x128(shuf_mask_lo_highset); + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + DEBUG_PRINTF("len %zu\n", buf_end - buf); + + if (buf_end - buf < 64) { + return truffleRevMini(wide_clear, wide_set, buf, buf_end); + } + + assert(buf_end - buf >= 64); + + // Preconditioning: most of the time our buffer won't be aligned. + m512 chars = loadu512(buf_end - 64); + rv = revBlock(wide_clear, wide_set, chars, buf_end - 64); + if (rv) { + return rv; + } + buf_end = (const u8 *)ROUNDDOWN_N((uintptr_t)buf_end, 64); + + const u8 *last_block = buf + 64; + while (buf_end > last_block) { + buf_end -= 64; + m512 lchars = load512(buf_end); + rv = revBlock(wide_clear, wide_set, lchars, buf_end); + if (rv) { + return rv; + } + } + + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf_end. 
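+    // The block at buf covers whatever the aligned loop above left
+    // unscanned; if it contains no match either, buf - 1 is returned below.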
+ chars = loadu512(buf); + rv = revBlock(wide_clear, wide_set, chars, buf); + if (rv) { + return rv; + } + return buf - 1; +} + +#endif diff --git a/regex/nfa/truffle.h b/regex/nfa/truffle.h new file mode 100644 index 000000000..f67227ad1 --- /dev/null +++ b/regex/nfa/truffle.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Truffle: fully general character class acceleration. + * + * Utilises the SSSE3 pshufb or AVX2 vpshufb shuffle instructions + */ + +#ifndef TRUFFLE_H +#define TRUFFLE_H + +#include "util/simd_types.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +const u8 *truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end); + +const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end); + +#ifdef __cplusplus +} +#endif + + +#endif /* TRUFFLE_H */ + diff --git a/regex/nfa/vermicelli.h b/regex/nfa/vermicelli.h new file mode 100644 index 000000000..ed797d83f --- /dev/null +++ b/regex/nfa/vermicelli.h @@ -0,0 +1,518 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Vermicelli: single-byte and double-byte acceleration. + */ + +#ifndef VERMICELLI_H +#define VERMICELLI_H + +#include "util/bitutils.h" +#include "util/simd_utils.h" +#include "util/unaligned.h" + +#include "vermicelli_sse.h" + +static really_inline +const u8 *vermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? vermMiniNocase(chars, buf, buf_end, 0) + : vermMini(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + return buf_end; + } +#else + if (buf_end - buf < VERM_BOUNDARY) { + for (; buf < buf_end; buf++) { + char cur = (char)*buf; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur == c) { + break; + } + } + return buf; + } +#endif + + uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf forward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 0) + : vermUnalign(chars, buf, 0); + if (ptr) { + return ptr; + } + + buf += VERM_BOUNDARY - min; + assert(buf < buf_end); + } + + // Aligned loops from here on in + const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 0) + : vermSearchAligned(chars, buf, buf_end - 1, 0); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end + ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 0) + : vermUnalign(chars, buf_end - VERM_BOUNDARY, 0); + return ptr ? ptr : buf_end; +} + +/* like vermicelliExec except returns the address of the first character which + * is not c */ +static really_inline +const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("nverm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? 
vermMiniNocase(chars, buf, buf_end, 1) + : vermMini(chars, buf, buf_end, 1); + if (ptr) { + return ptr; + } + return buf_end; + } +#else + if (buf_end - buf < VERM_BOUNDARY) { + for (; buf < buf_end; buf++) { + char cur = (char)*buf; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur != c) { + break; + } + } + return buf; + } +#endif + + size_t min = (size_t)buf % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf forward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 1) + : vermUnalign(chars, buf, 1); + if (ptr) { + return ptr; + } + + buf += VERM_BOUNDARY - min; + assert(buf < buf_end); + } + + // Aligned loops from here on in + const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 1) + : vermSearchAligned(chars, buf, buf_end - 1, 1); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end + ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 1) + : vermUnalign(chars, buf_end - VERM_BOUNDARY, 1); + return ptr ? ptr : buf_end; +} + +static really_inline +const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ + VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ + +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? dvermMiniNocase(chars1, chars2, buf, buf_end) + : dvermMini(chars1, chars2, buf, buf_end); + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + u8 mask = nocase ? CASE_CLEAR : 0xff; + if ((buf_end[-1] & mask) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf forward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase + ? dvermPreconditionNocase(chars1, chars2, buf) + : dvermPrecondition(chars1, chars2, buf); + if (ptr) { + return ptr; + } + + buf += VERM_BOUNDARY - min; + assert(buf < buf_end); + } + + // Aligned loops from here on in + const u8 *ptr = nocase ? dvermSearchAlignedNocase(chars1, chars2, c1, c2, + buf, buf_end) + : dvermSearchAligned(chars1, chars2, c1, c2, buf, + buf_end); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end + ptr = nocase ? dvermPreconditionNocase(chars1, chars2, + buf_end - VERM_BOUNDARY) + : dvermPrecondition(chars1, chars2, buf_end - VERM_BOUNDARY); + + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + u8 mask = nocase ? 
CASE_CLEAR : 0xff; + if ((buf_end[-1] & mask) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; +} + +static really_inline +const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("double verm scan (\\x%02hhx&\\x%02hhx)(\\x%02hhx&\\x%02hhx) " + "over %zu bytes\n", c1, m1, c2, m2, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars1 = VERM_SET_FN(c1); + VERM_TYPE chars2 = VERM_SET_FN(c2); + VERM_TYPE mask1 = VERM_SET_FN(m1); + VERM_TYPE mask2 = VERM_SET_FN(m2); + +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = dvermMiniMasked(chars1, chars2, mask1, mask2, buf, + buf_end); + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + if ((buf_end[-1] & m1) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf forward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *p = dvermPreconditionMasked(chars1, chars2, mask1, mask2, buf); + if (p) { + return p; + } + + buf += VERM_BOUNDARY - min; + assert(buf < buf_end); + } + + // Aligned loops from here on in + const u8 *ptr = dvermSearchAlignedMasked(chars1, chars2, mask1, mask2, c1, + c2, m1, m2, buf, buf_end); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end + ptr = dvermPreconditionMasked(chars1, chars2, mask1, mask2, + buf_end - VERM_BOUNDARY); + + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + if ((buf_end[-1] & m1) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; +} + +// Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if +// character not found. +static really_inline +const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? rvermMiniNocase(chars, buf, buf_end, 0) + : rvermMini(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + return buf - 1; + } +#else + if (buf_end - buf < VERM_BOUNDARY) { + for (buf_end--; buf_end >= buf; buf_end--) { + char cur = (char)*buf_end; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur == c) { + break; + } + } + return buf_end; + } +#endif + + size_t min = (size_t)buf_end % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf backward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase ? rvermUnalignNocase(chars, + buf_end - VERM_BOUNDARY, + 0) + : rvermUnalign(chars, buf_end - VERM_BOUNDARY, + 0); + + if (ptr) { + return ptr; + } + + buf_end -= min; + if (buf >= buf_end) { + return buf_end; + } + } + + // Aligned loops from here on in. + const u8 *ptr = nocase ? 
rvermSearchAlignedNocase(chars, buf, buf_end, 0) + : rvermSearchAligned(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end, return buf - 1 if not found. + ptr = nocase ? rvermUnalignNocase(chars, buf, 0) + : rvermUnalign(chars, buf, 0); + return ptr ? ptr : buf - 1; +} + +/* like rvermicelliExec except returns the address of the last character which + * is not c */ +static really_inline +const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? rvermMiniNocase(chars, buf, buf_end, 1) + : rvermMini(chars, buf, buf_end, 1); + if (ptr) { + return ptr; + } + return buf - 1; + } +#else + if (buf_end - buf < VERM_BOUNDARY) { + for (buf_end--; buf_end >= buf; buf_end--) { + char cur = (char)*buf_end; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur != c) { + break; + } + } + return buf_end; + } +#endif + + size_t min = (size_t)buf_end % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf backward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase ? rvermUnalignNocase(chars, + buf_end - VERM_BOUNDARY, + 1) + : rvermUnalign(chars, buf_end - VERM_BOUNDARY, + 1); + + if (ptr) { + return ptr; + } + + buf_end -= min; + if (buf >= buf_end) { + return buf_end; + } + } + + // Aligned loops from here on in. + const u8 *ptr = nocase ? rvermSearchAlignedNocase(chars, buf, buf_end, 1) + : rvermSearchAligned(chars, buf, buf_end, 1); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end, return buf - 1 if not found. + ptr = nocase ? rvermUnalignNocase(chars, buf, 1) + : rvermUnalign(chars, buf, 1); + return ptr ? ptr : buf - 1; +} + +/* returns highest offset of c2 (NOTE: not c1) */ +static really_inline +const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ + VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ + +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? rdvermMiniNocase(chars1, chars2, buf, buf_end) + : rdvermMini(chars1, chars2, buf, buf_end); + + if (ptr) { + return ptr; + } + + // check for partial match at end ??? + return buf - 1; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + size_t min = (size_t)buf_end % VERM_BOUNDARY; + if (min) { + // input not aligned, so we need to run one iteration with an unaligned + // load, then skip buf forward to the next aligned address. There's + // some small overlap here, but we don't mind scanning it twice if we + // can do it quickly, do we? + const u8 *ptr = nocase ? 
rdvermPreconditionNocase(chars1, chars2, + buf_end - VERM_BOUNDARY) + : rdvermPrecondition(chars1, chars2, + buf_end - VERM_BOUNDARY); + + if (ptr) { + return ptr; + } + + buf_end -= min; + if (buf >= buf_end) { + return buf_end; + } + } + + // Aligned loops from here on in + if (nocase) { + return rdvermSearchAlignedNocase(chars1, chars2, c1, c2, buf, buf_end); + } else { + return rdvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end); + } +} + +#endif /* VERMICELLI_H */ diff --git a/regex/nfa/vermicelli_run.h b/regex/nfa/vermicelli_run.h new file mode 100644 index 000000000..d6fe7ec78 --- /dev/null +++ b/regex/nfa/vermicelli_run.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vermicelli.h" + +static really_inline +const u8 *find_xverm_run(char c, char nocase, u32 repeat, UNUSED const u8 *buf, + const u8 *buf_start, const u8 *buf_end, char negate) { + DEBUG_PRINTF("looking for 0x%hhx{%u} in %p [%zd, %zd)\n", c, repeat, buf, + buf_start - buf, buf_end - buf); + + /* TODO optimise on where it is easy to get a dense bitfield of character + * matches */ + if (repeat == 1) { + return negate ? nvermicelliExec(c, nocase, buf_start, buf_end) + : vermicelliExec(c, nocase, buf_start, buf_end); + } + + while (1) { + const u8 *s; + if (negate) { + s = nvermicelliExec(c, nocase, buf_start, buf_end); + } else if (buf_end - buf_start >= VERM_BOUNDARY && !nocase) { + s = vermicelliDoubleExec(c, c, nocase, buf_start, buf_end); + + if (s != buf_end && *s != c) { /* double verm is not certain to be + * precise */ + s = vermicelliExec(c, nocase, s, buf_end); + } + } else { + s = vermicelliExec(c, nocase, buf_start, buf_end); + } + if (s == buf_end) { + return s; + } + + DEBUG_PRINTF("cand %zd\n", s - buf); + + const u8 *test_e = MIN(s + repeat, buf_end); + + const u8 *rv = negate ? 
vermicelliExec(c, nocase, s, test_e) + : nvermicelliExec(c, nocase, s, test_e); + + assert(rv > buf_start); + assert(rv <= buf_end); + + if (rv == test_e) { + return s; + } + + buf_start = rv; + } +} + +static really_inline +const u8 *find_verm_run(char c, char nocase, u32 repeat, const u8 *buf, + const u8 *buf_start, const u8 *buf_end) { + return find_xverm_run(c, nocase, repeat, buf, buf_start, buf_end, 0); +} + +static really_inline +const u8 *find_nverm_run(char c, char nocase, u32 repeat, const u8 *buf, + const u8 *buf_start, const u8 *buf_end) { + return find_xverm_run(c, nocase, repeat, buf, buf_start, buf_end, 1); +} diff --git a/regex/nfa/vermicelli_sse.h b/regex/nfa/vermicelli_sse.h new file mode 100644 index 000000000..3307486cf --- /dev/null +++ b/regex/nfa/vermicelli_sse.h @@ -0,0 +1,889 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Vermicelli: Intel SSE implementation. 
+ * + * (users should include vermicelli.h) + */ + +#if !defined(HAVE_AVX512) + +#define VERM_BOUNDARY 16 +#define VERM_TYPE m128 +#define VERM_SET_FN set16x8 + +static really_inline +const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, + char negate) { + assert((size_t)buf % 16 == 0); + for (; buf + 31 < buf_end; buf += 32) { + m128 data = load128(buf); + u32 z1 = movemask128(eq128(chars, data)); + m128 data2 = load128(buf + 16); + u32 z2 = movemask128(eq128(chars, data2)); + u32 z = z1 | (z2 << 16); + if (negate) { + z = ~z; + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + for (; buf + 15 < buf_end; buf += 16) { + m128 data = load128(buf); + u32 z = movemask128(eq128(chars, data)); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + return NULL; +} + +static really_inline +const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf, + const u8 *buf_end, char negate) { + assert((size_t)buf % 16 == 0); + m128 casemask = set16x8(CASE_CLEAR); + + for (; buf + 31 < buf_end; buf += 32) { + m128 data = load128(buf); + u32 z1 = movemask128(eq128(chars, and128(casemask, data))); + m128 data2 = load128(buf + 16); + u32 z2 = movemask128(eq128(chars, and128(casemask, data2))); + u32 z = z1 | (z2 << 16); + if (negate) { + z = ~z; + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + + for (; buf + 15 < buf_end; buf += 16) { + m128 data = load128(buf); + u32 z = movemask128(eq128(chars, and128(casemask, data))); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) { + m128 data = loadu128(buf); // unaligned + u32 z = movemask128(eq128(chars, data)); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + return buf + ctz32(z); + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) { + m128 casemask = set16x8(CASE_CLEAR); + m128 data = loadu128(buf); // unaligned + u32 z = movemask128(eq128(chars, and128(casemask, data))); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + return buf + ctz32(z); + } + return NULL; +} + +static really_inline +const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + for (; buf + 16 < buf_end; buf += 16) { + m128 data = load128(buf); + u32 z = movemask128(and128(eq128(chars1, data), + rshiftbyte_m128(eq128(chars2, data), 1))); + if (buf[15] == c1 && buf[16] == c2) { + z |= (1 << 15); + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 16 == 0); + m128 casemask = set16x8(CASE_CLEAR); + + for (; buf + 16 < buf_end; buf += 16) { + m128 data = load128(buf); + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars1, v), + rshiftbyte_m128(eq128(chars2, v), 1))); + if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) { + z |= (1 << 15); + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2, + m128 mask1, m128 mask2, u8 c1, u8 c2, u8 m1, + u8 m2, 
const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 16 == 0); + + for (; buf + 16 < buf_end; buf += 16) { + m128 data = load128(buf); + m128 v1 = eq128(chars1, and128(data, mask1)); + m128 v2 = eq128(chars2, and128(data, mask2)); + u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); + + if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) { + z |= (1 << 15); + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { + m128 data = loadu128(buf); // unaligned + u32 z = movemask128(and128(eq128(chars1, data), + rshiftbyte_m128(eq128(chars2, data), 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { + /* due to laziness, nonalphas and nocase having interesting behaviour */ + m128 casemask = set16x8(CASE_CLEAR); + m128 data = loadu128(buf); // unaligned + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars1, v), + rshiftbyte_m128(eq128(chars2, v), 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2, + m128 mask1, m128 mask2, const u8 *buf) { + m128 data = loadu128(buf); // unaligned + m128 v1 = eq128(chars1, and128(data, mask1)); + m128 v2 = eq128(chars2, and128(data, mask2)); + u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *lastMatchOffset(const u8 *buf_end, u32 z) { + assert(z); + return buf_end - 16 + 31 - clz32(z); +} + +static really_inline +const u8 *rvermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, + char negate) { + assert((size_t)buf_end % 16 == 0); + for (; buf + 15 < buf_end; buf_end -= 16) { + m128 data = load128(buf_end - 16); + u32 z = movemask128(eq128(chars, data)); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return NULL; +} + +static really_inline +const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf, + const u8 *buf_end, char negate) { + assert((size_t)buf_end % 16 == 0); + m128 casemask = set16x8(CASE_CLEAR); + + for (; buf + 15 < buf_end; buf_end -= 16) { + m128 data = load128(buf_end - 16); + u32 z = movemask128(eq128(chars, and128(casemask, data))); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) { + m128 data = loadu128(buf); // unaligned + u32 z = movemask128(eq128(chars, data)); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + return lastMatchOffset(buf + 16, z); + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) { + m128 casemask = set16x8(CASE_CLEAR); + m128 data = loadu128(buf); // unaligned + u32 z = 
movemask128(eq128(chars, and128(casemask, data))); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + return lastMatchOffset(buf + 16, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 16 == 0); + + for (; buf + 16 < buf_end; buf_end -= 16) { + m128 data = load128(buf_end - 16); + u32 z = movemask128(and128(eq128(chars2, data), + lshiftbyte_m128(eq128(chars1, data), 1))); + if (buf_end[-17] == c1 && buf_end[-16] == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +static really_inline +const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 16 == 0); + m128 casemask = set16x8(CASE_CLEAR); + + for (; buf + 16 < buf_end; buf_end -= 16) { + m128 data = load128(buf_end - 16); + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars2, v), + lshiftbyte_m128(eq128(chars1, v), 1))); + if ((buf_end[-17] & CASE_CLEAR) == c1 + && (buf_end[-16] & CASE_CLEAR) == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { + m128 data = loadu128(buf); + u32 z = movemask128(and128(eq128(chars2, data), + lshiftbyte_m128(eq128(chars1, data), 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + return lastMatchOffset(buf + 16, z); + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { + /* due to laziness, nonalphas and nocase having interesting behaviour */ + m128 casemask = set16x8(CASE_CLEAR); + m128 data = loadu128(buf); + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars2, v), + lshiftbyte_m128(eq128(chars1, v), 1))); + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + return lastMatchOffset(buf + 16, z); + } + + return NULL; +} + +#else // HAVE_AVX512 + +#define VERM_BOUNDARY 64 +#define VERM_TYPE m512 +#define VERM_SET_FN set64x8 + +static really_inline +const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars, data); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *vermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars, v); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *vermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + assert((size_t)buf % 64 == 0); + for (; buf + 63 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; 
+ } + } + return NULL; +} + +static really_inline +const u8 *vermSearchAlignedNocase(m512 chars, const u8 *buf, + const u8 *buf_end, char negate) { + assert((size_t)buf % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 63 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *vermUnalign(m512 chars, const u8 *buf, char negate) { + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *vermUnalignNocase(m512 chars, const u8 *buf, char negate) { + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *dvermMini(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermMiniMasked(m512 chars1, m512 chars2, m512 mask1, m512 mask2, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); + if (buf[63] == c1 && buf[64] == c2) { + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + m512 v = and512(casemask, data); + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + if ((buf[63] & CASE_CLEAR) == c1 && (buf[64] & CASE_CLEAR) == c2) { + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 
*dvermSearchAlignedMasked(m512 chars1, m512 chars2, + m512 mask1, m512 mask2, u8 c1, u8 c2, u8 m1, + u8 m2, const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 64 == 0); + + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + if ((buf[63] & m1) == c1 && (buf[64] & m2) == c2) { + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) { + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { + /* due to laziness, nonalphas and nocase having interesting behaviour */ + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + m512 v = and512(casemask, data); + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionMasked(m512 chars1, m512 chars2, + m512 mask1, m512 mask2, const u8 *buf) { + m512 data = loadu512(buf); // unaligned + m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *lastMatchOffset(const u8 *buf_end, u64a z) { + assert(z); + return buf_end - 64 + 63 - clz64(z); +} + +static really_inline +const u8 *rvermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars, data); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rvermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars, v); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rvermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + assert((size_t)buf_end % 64 == 0); + for (; buf + 63 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return NULL; +} + +static really_inline +const u8 *rvermSearchAlignedNocase(m512 chars, const u8 *buf, + const u8 *buf_end, char negate) { + assert((size_t)buf_end % 64 == 
0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 63 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rvermUnalign(m512 chars, const u8 *buf, char negate) { + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rvermUnalignNocase(m512 chars, const u8 *buf, char negate) { + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermMini(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); + + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 64 == 0); + + for (; buf + 64 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); + if (buf_end[-65] == c1 && buf_end[-64] == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +static really_inline +const u8 *rdvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 64 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + m512 v = and512(casemask, data); + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + if ((buf_end[-65] & CASE_CLEAR) == c1 + && (buf_end[-64] & CASE_CLEAR) == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) { + m512 data = loadu512(buf); + u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); + + // no fixup of the boundary required - the aligned run will pick it up + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { + // due to laziness, nonalphas and nocase having interesting behaviour + m512 casemask = set64x8(CASE_CLEAR); + 
m512 data = loadu512(buf); + m512 v = and512(casemask, data); + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + // no fixup of the boundary required - the aligned run will pick it up + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + + return NULL; +} + +#endif // HAVE_AVX512 diff --git a/regex/report.h b/regex/report.h new file mode 100644 index 000000000..b35f4c052 --- /dev/null +++ b/regex/report.h @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2016-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime functions to do with reports, inlined into callers. + */ + +#ifndef REPORT_H +#define REPORT_H + +#include "hs_internal.h" +#include "hs_runtime.h" +#include "scratch.h" +#include "ue2common.h" +#include "nfa/callback.h" +#include "nfa/nfa_internal.h" +#include "rose/runtime.h" +#include "som/som_runtime.h" +#include "util/exhaust.h" +#include "util/logical.h" +#include "util/fatbit.h" + +enum DedupeResult { + DEDUPE_CONTINUE, //!< Continue with match, not a dupe. + DEDUPE_SKIP, //!< Don't report this match, dupe or delayed due to SOM. + DEDUPE_HALT //!< User instructed us to stop matching. +}; + +static really_inline +enum DedupeResult dedupeCatchup(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset, + u64a from_offset, u64a to_offset, u32 dkey, + s32 offset_adjust, char is_external_report, + char quash_som, const char do_som) { + DEBUG_PRINTF("offset=%llu, match=[%llu,%llu], dkey=%u, do_som=%d\n", offset, + from_offset, to_offset, dkey, do_som); + + // We should not have been called if there's no dedupe work to do. 
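+    // (Editorial note) The deduper keeps its logs in pairs indexed by offset
+    // parity (e.g. log[to_offset % 2]), so reports at consecutive offsets can
+    // be tracked without flushing both logs on every offset change.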
+ assert(do_som || dkey != MO_INVALID_IDX); + + struct match_deduper *deduper = &scratch->deduper; + if (offset != deduper->current_report_offset) { + assert(deduper->current_report_offset == ~0ULL || + deduper->current_report_offset < offset); + if (offset == deduper->current_report_offset + 1) { + fatbit_clear(deduper->log[offset % 2]); + } else { + fatbit_clear(deduper->log[0]); + fatbit_clear(deduper->log[1]); + } + + if (do_som && flushStoredSomMatches(scratch, offset)) { + return DEDUPE_HALT; + } + deduper->current_report_offset = offset; + } + + if (dkey != MO_INVALID_IDX) { + const u32 dkeyCount = rose->dkeyCount; + if (is_external_report || quash_som) { + DEBUG_PRINTF("checking dkey %u at offset %llu\n", dkey, to_offset); + assert(offset_adjust == 0 || offset_adjust == -1); + if (fatbit_set(deduper->log[to_offset % 2], dkeyCount, dkey)) { + /* we have already raised this report at this offset, squash + * dupe match. */ + DEBUG_PRINTF("dedupe\n"); + return DEDUPE_SKIP; + } + } else if (do_som) { + /* SOM external event */ + DEBUG_PRINTF("checking dkey %u at offset %llu\n", dkey, to_offset); + assert(offset_adjust == 0 || offset_adjust == -1); + u64a *starts = deduper->som_start_log[to_offset % 2]; + if (fatbit_set(deduper->som_log[to_offset % 2], dkeyCount, dkey)) { + starts[dkey] = MIN(starts[dkey], from_offset); + } else { + starts[dkey] = from_offset; + } + DEBUG_PRINTF("starts[%u]=%llu\n", dkey, starts[dkey]); + + if (offset_adjust) { + deduper->som_log_dirty |= 1; + } else { + deduper->som_log_dirty |= 2; + } + + return DEDUPE_SKIP; + } + } + + return DEDUPE_CONTINUE; +} + +/** \brief Test whether the given key (\a ekey) is set in the exhaustion vector + * \a evec. */ +static really_inline +int isExhausted(const struct RoseEngine *rose, const char *evec, u32 ekey) { + DEBUG_PRINTF("checking exhaustion %p %u\n", evec, ekey); + assert(ekey != INVALID_EKEY); + assert(ekey < rose->ekeyCount); + return mmbit_isset((const u8 *)evec, rose->ekeyCount, ekey); +} + +/** \brief Returns 1 if all exhaustion keys in the bitvector are on. */ +static really_inline +int isAllExhausted(const struct RoseEngine *rose, const char *evec) { + if (!rose->canExhaust) { + return 0; /* pattern set is inexhaustible */ + } + + return mmbit_all((const u8 *)evec, rose->ekeyCount); +} + +/** \brief Mark key \a ekey on in the exhaustion vector. */ +static really_inline +void markAsMatched(const struct RoseEngine *rose, char *evec, u32 ekey) { + DEBUG_PRINTF("marking as exhausted key %u\n", ekey); + assert(ekey != INVALID_EKEY); + assert(ekey < rose->ekeyCount); + mmbit_set((u8 *)evec, rose->ekeyCount, ekey); +} + +/** \brief Clear all keys in the exhaustion vector. */ +static really_inline +void clearEvec(const struct RoseEngine *rose, char *evec) { + DEBUG_PRINTF("clearing evec %p %u\n", evec, rose->ekeyCount); + mmbit_clear((u8 *)evec, rose->ekeyCount); +} + +/** \brief Test whether the given key (\a lkey) is set in the logical vector + * \a lvec. */ +static really_inline +char getLogicalVal(const struct RoseEngine *rose, const char *lvec, u32 lkey) { + DEBUG_PRINTF("checking lkey matching %p %u\n", lvec, lkey); + assert(lkey != INVALID_LKEY); + assert(lkey < rose->lkeyCount + rose->lopCount); + return mmbit_isset((const u8 *)lvec, rose->lkeyCount + rose->lopCount, + lkey); +} + +/** \brief Mark key \a lkey on in the logical vector. 
*/ +static really_inline +void setLogicalVal(const struct RoseEngine *rose, char *lvec, u32 lkey, + char val) { + DEBUG_PRINTF("marking as matched logical key %u\n", lkey); + assert(lkey != INVALID_LKEY); + assert(lkey < rose->lkeyCount + rose->lopCount); + switch (val) { + case 0: + mmbit_unset((u8 *)lvec, rose->lkeyCount + rose->lopCount, lkey); + break; + default: + mmbit_set((u8 *)lvec, rose->lkeyCount + rose->lopCount, lkey); + break; + } +} + +/** \brief Mark key \a ckey on in the combination vector. */ +static really_inline +void setCombinationActive(const struct RoseEngine *rose, char *cvec, u32 ckey) { + DEBUG_PRINTF("marking as active combination key %u\n", ckey); + assert(ckey != INVALID_CKEY); + assert(ckey < rose->ckeyCount); + mmbit_set((u8 *)cvec, rose->ckeyCount, ckey); +} + +/** \brief Returns 1 if compliant to all logical combinations. */ +static really_inline +char isLogicalCombination(const struct RoseEngine *rose, char *lvec, + u32 start, u32 result) { + const struct LogicalOp *logicalTree = (const struct LogicalOp *) + ((const char *)rose + rose->logicalTreeOffset); + assert(start >= rose->lkeyCount); + assert(start <= result); + assert(result < rose->lkeyCount + rose->lopCount); + for (u32 i = start; i <= result; i++) { + const struct LogicalOp *op = logicalTree + (i - rose->lkeyCount); + assert(i == op->id); + assert(op->op <= LAST_LOGICAL_OP); + switch ((enum LogicalOpType)op->op) { + case LOGICAL_OP_NOT: + setLogicalVal(rose, lvec, op->id, + !getLogicalVal(rose, lvec, op->ro)); + break; + case LOGICAL_OP_AND: + setLogicalVal(rose, lvec, op->id, + getLogicalVal(rose, lvec, op->lo) & + getLogicalVal(rose, lvec, op->ro)); // && + break; + case LOGICAL_OP_OR: + setLogicalVal(rose, lvec, op->id, + getLogicalVal(rose, lvec, op->lo) | + getLogicalVal(rose, lvec, op->ro)); // || + break; + } + } + return getLogicalVal(rose, lvec, result); +} + +/** \brief Returns 1 if combination matches when no sub-expression matches. 
*/ +static really_inline +char isPurelyNegativeMatch(const struct RoseEngine *rose, char *lvec, + u32 start, u32 result) { + const struct LogicalOp *logicalTree = (const struct LogicalOp *) + ((const char *)rose + rose->logicalTreeOffset); + assert(start >= rose->lkeyCount); + assert(start <= result); + assert(result < rose->lkeyCount + rose->lopCount); + for (u32 i = start; i <= result; i++) { + const struct LogicalOp *op = logicalTree + (i - rose->lkeyCount); + assert(i == op->id); + assert(op->op <= LAST_LOGICAL_OP); + switch ((enum LogicalOpType)op->op) { + case LOGICAL_OP_NOT: + if ((op->ro < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->ro)) { + // sub-expression not negative + return 0; + } + setLogicalVal(rose, lvec, op->id, + !getLogicalVal(rose, lvec, op->ro)); + break; + case LOGICAL_OP_AND: + if (((op->lo < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->lo)) || + ((op->ro < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->ro))) { + // sub-expression not negative + return 0; + } + setLogicalVal(rose, lvec, op->id, + getLogicalVal(rose, lvec, op->lo) & + getLogicalVal(rose, lvec, op->ro)); // && + break; + case LOGICAL_OP_OR: + if (((op->lo < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->lo)) || + ((op->ro < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->ro))) { + // sub-expression not negative + return 0; + } + setLogicalVal(rose, lvec, op->id, + getLogicalVal(rose, lvec, op->lo) | + getLogicalVal(rose, lvec, op->ro)); // || + break; + } + } + return getLogicalVal(rose, lvec, result); +} + +/** \brief Clear all keys in the logical vector. */ +static really_inline +void clearLvec(const struct RoseEngine *rose, char *lvec, char *cvec) { + DEBUG_PRINTF("clearing lvec %p %u\n", lvec, + rose->lkeyCount + rose->lopCount); + DEBUG_PRINTF("clearing cvec %p %u\n", cvec, rose->ckeyCount); + mmbit_clear((u8 *)lvec, rose->lkeyCount + rose->lopCount); + mmbit_clear((u8 *)cvec, rose->ckeyCount); +} + +/** \brief Clear all keys in the combination vector. */ +static really_inline +void clearCvec(const struct RoseEngine *rose, char *cvec) { + DEBUG_PRINTF("clearing cvec %p %u\n", cvec, rose->ckeyCount); + mmbit_clear((u8 *)cvec, rose->ckeyCount); +} + +/** + * \brief Deliver the given report to the user callback. + * + * Assumes all preconditions (bounds, exhaustion etc) have been checked and + * that dedupe catchup has been done. 
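+ *
+ * (Editorial note) As implemented below: if the user callback asks to stop,
+ * MO_HALT_MATCHING is returned; otherwise the exhaustion key (when valid) is
+ * marked and MO_CONTINUE_MATCHING is returned, or
+ * ROSE_CONTINUE_MATCHING_NO_EXHAUST when there is no exhaustion key.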
+ */ +static really_inline +int roseDeliverReport(u64a offset, ReportID onmatch, s32 offset_adjust, + struct hs_scratch *scratch, u32 ekey) { + assert(scratch); + assert(scratch->magic == SCRATCH_MAGIC); + + struct core_info *ci = &scratch->core_info; + + u32 flags = 0; +#ifndef RELEASE_BUILD + if (offset_adjust) { + // alert testing tools that we've got adjusted matches + flags |= HS_MATCH_FLAG_ADJUSTED; + } +#endif + + assert(!can_stop_matching(scratch)); + assert(ekey == INVALID_EKEY || + !isExhausted(ci->rose, ci->exhaustionVector, ekey)); + + u64a from_offset = 0; + u64a to_offset = offset + offset_adjust; + + DEBUG_PRINTF(">> reporting match @[%llu,%llu] for sig %u ctxt %p <<\n", + from_offset, to_offset, onmatch, ci->userContext); + + int halt = ci->userCallback(onmatch, from_offset, to_offset, flags, + ci->userContext); + if (halt) { + DEBUG_PRINTF("callback requested to terminate matches\n"); + ci->status |= STATUS_TERMINATED; + return MO_HALT_MATCHING; + } + + if (ekey != INVALID_EKEY) { + markAsMatched(ci->rose, ci->exhaustionVector, ekey); + return MO_CONTINUE_MATCHING; + } else { + return ROSE_CONTINUE_MATCHING_NO_EXHAUST; + } +} + +/** + * \brief Deliver the given SOM report to the user callback. + * + * Assumes all preconditions (bounds, exhaustion etc) have been checked and + * that dedupe catchup has been done. + */ +static really_inline +int roseDeliverSomReport(u64a from_offset, u64a to_offset, ReportID onmatch, + s32 offset_adjust, struct hs_scratch *scratch, + u32 ekey) { + assert(scratch); + assert(scratch->magic == SCRATCH_MAGIC); + + struct core_info *ci = &scratch->core_info; + + u32 flags = 0; +#ifndef RELEASE_BUILD + if (offset_adjust) { + // alert testing tools that we've got adjusted matches + flags |= HS_MATCH_FLAG_ADJUSTED; + } +#endif + + assert(!can_stop_matching(scratch)); + assert(ekey == INVALID_EKEY || + !isExhausted(ci->rose, ci->exhaustionVector, ekey)); + + to_offset += offset_adjust; + assert(from_offset == HS_OFFSET_PAST_HORIZON || from_offset <= to_offset); + + DEBUG_PRINTF(">> reporting match @[%llu,%llu] for sig %u ctxt %p <<\n", + from_offset, to_offset, onmatch, ci->userContext); + + int halt = ci->userCallback(onmatch, from_offset, to_offset, flags, + ci->userContext); + + if (halt) { + DEBUG_PRINTF("callback requested to terminate matches\n"); + ci->status |= STATUS_TERMINATED; + return MO_HALT_MATCHING; + } + + if (ekey != INVALID_EKEY) { + markAsMatched(ci->rose, ci->exhaustionVector, ekey); + return MO_CONTINUE_MATCHING; + } else { + return ROSE_CONTINUE_MATCHING_NO_EXHAUST; + } +} + +#endif // REPORT_H diff --git a/regex/rose/block.c b/regex/rose/block.c new file mode 100644 index 000000000..b3f424cb7 --- /dev/null +++ b/regex/rose/block.c @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "catchup.h" +#include "init.h" +#include "match.h" +#include "program_runtime.h" +#include "rose.h" +#include "rose_common.h" +#include "nfa/nfa_api.h" +#include "nfa/nfa_internal.h" +#include "nfa/nfa_rev_api.h" +#include "nfa/mcclellan.h" +#include "util/fatbit.h" + +static rose_inline +void runAnchoredTableBlock(const struct RoseEngine *t, const void *atable, + struct hs_scratch *scratch) { + const u8 *buffer = scratch->core_info.buf; + size_t length = scratch->core_info.len; + size_t alen = MIN(length, t->anchoredDistance); + const struct anchored_matcher_info *curr = atable; + + DEBUG_PRINTF("BEGIN ANCHORED (over %zu/%zu)\n", alen, length); + + do { + const struct NFA *nfa + = (const struct NFA *)((const char *)curr + sizeof(*curr)); + + assert(t->anchoredDistance > curr->anchoredMinDistance); + if (length >= curr->anchoredMinDistance) { + size_t local_alen = alen - curr->anchoredMinDistance; + const u8 *local_buffer = buffer + curr->anchoredMinDistance; + + DEBUG_PRINTF("--anchored nfa (+%u)\n", curr->anchoredMinDistance); + assert(isMcClellanType(nfa->type)); + if (nfa->type == MCCLELLAN_NFA_8) { + nfaExecMcClellan8_B(nfa, curr->anchoredMinDistance, + local_buffer, local_alen, + roseAnchoredCallback, scratch); + } else { + nfaExecMcClellan16_B(nfa, curr->anchoredMinDistance, + local_buffer, local_alen, + roseAnchoredCallback, scratch); + } + } + + if (!curr->next_offset) { + break; + } + + curr = (const void *)((const char *)curr + curr->next_offset); + } while (1); +} + +static really_inline +void init_state_for_block(const struct RoseEngine *t, char *state) { + assert(t); + assert(state); + + DEBUG_PRINTF("init for Rose %p with %u state indices\n", t, + t->rolesWithStateCount); + + // Rose is guaranteed 8-aligned state + assert(ISALIGNED_N(state, 8)); + + init_state(t, state); +} + +static really_inline +void init_outfixes_for_block(const struct RoseEngine *t, + struct hs_scratch *scratch, char *state, + char is_small_block) { + /* active leaf array has been cleared by the init scatter */ + + if (t->initMpvNfa != MO_INVALID_IDX) { + assert(t->initMpvNfa == 0); + const struct NFA *nfa = getNfaByQueue(t, 0); + DEBUG_PRINTF("testing minwidth %u > len %zu\n", nfa->minWidth, + scratch->core_info.len); + size_t len = nfaRevAccelCheck(nfa, scratch->core_info.buf, + scratch->core_info.len); + if (len) { + u8 *activeArray = getActiveLeafArray(t, state); + const u32 activeArraySize = t->activeArrayCount; + const u32 qCount = t->queueCount; + + mmbit_set(activeArray, activeArraySize, 0); + fatbit_set(scratch->aqa, qCount, 0); + + struct mq *q = scratch->queues; + initQueue(q, 0, t, scratch); + q->length = len; /* adjust for rev_accel */ + nfaQueueInitState(nfa, q); + pushQueueAt(q, 0, 
MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + } + } + + if (is_small_block && !t->hasOutfixesInSmallBlock) { + DEBUG_PRINTF("all outfixes in small block table\n"); + return; + } + + if (t->outfixBeginQueue != t->outfixEndQueue) { + blockInitSufPQ(t, state, scratch, is_small_block); + } +} + +static really_inline +void init_for_block(const struct RoseEngine *t, struct hs_scratch *scratch, + char *state, char is_small_block) { + init_state_for_block(t, state); + + struct RoseContext *tctxt = &scratch->tctxt; + + tctxt->groups = t->initialGroups; + tctxt->lit_offset_adjust = 1; // index after last byte + tctxt->delayLastEndOffset = 0; + tctxt->lastEndOffset = 0; + tctxt->filledDelayedSlots = 0; + tctxt->lastMatchOffset = 0; + tctxt->lastCombMatchOffset = 0; + tctxt->minMatchOffset = 0; + tctxt->minNonMpvMatchOffset = 0; + tctxt->next_mpv_offset = 0; + + scratch->al_log_sum = 0; + + fatbit_clear(scratch->aqa); + + scratch->catchup_pq.qm_size = 0; + + init_outfixes_for_block(t, scratch, state, is_small_block); +} + +static rose_inline +void roseBlockEodExec(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch) { + assert(t->requiresEodCheck); + assert(t->maxBiAnchoredWidth == ROSE_BOUND_INF + || offset <= t->maxBiAnchoredWidth); + + assert(!can_stop_matching(scratch)); + assert(t->eodProgramOffset); + + // Ensure that history is correct before we look for EOD matches. + roseFlushLastByteHistory(t, scratch, offset); + scratch->tctxt.lastEndOffset = offset; + + DEBUG_PRINTF("running eod program at %u\n", t->eodProgramOffset); + + // There should be no pending delayed literals. + assert(!scratch->tctxt.filledDelayedSlots); + + const u64a som = 0; + const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; + + // Note: we ignore the result, as this is the last thing to ever happen on + // a scan. + roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, flags); +} + +/** + * \brief Run the anchored matcher, if any. Returns non-zero if matching should + * halt. + */ +static rose_inline +int roseBlockAnchored(const struct RoseEngine *t, struct hs_scratch *scratch) { + const void *atable = getALiteralMatcher(t); + if (!atable) { + DEBUG_PRINTF("no anchored table\n"); + return 0; + } + + const size_t length = scratch->core_info.len; + + if (t->amatcherMaxBiAnchoredWidth != ROSE_BOUND_INF && + length > t->amatcherMaxBiAnchoredWidth) { + return 0; + } + + if (length < t->amatcherMinWidth) { + return 0; + } + + runAnchoredTableBlock(t, atable, scratch); + + return can_stop_matching(scratch); +} + +/** + * \brief Run the floating matcher, if any. Returns non-zero if matching should + * halt. 
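+ *
+ * (Editorial note) As implemented below, the floating scan is skipped when
+ * there is no floating table, when there are no floating roots and no
+ * in-flight matches, or when the block length falls outside the matcher's
+ * width bounds; otherwise hwlmExec() runs over the first
+ * min(floatingDistance, len) bytes with matching starting at
+ * floatingMinDistance.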
+ */ +static rose_inline +int roseBlockFloating(const struct RoseEngine *t, struct hs_scratch *scratch) { + const struct HWLM *ftable = getFLiteralMatcher(t); + if (!ftable) { + return 0; + } + + const size_t length = scratch->core_info.len; + char *state = scratch->core_info.state; + struct RoseContext *tctxt = &scratch->tctxt; + + DEBUG_PRINTF("ftable fd=%u fmd %u\n", t->floatingDistance, + t->floatingMinDistance); + if (t->noFloatingRoots && !roseHasInFlightMatches(t, state, scratch)) { + DEBUG_PRINTF("skip FLOATING: no inflight matches\n"); + return 0; + } + + if (t->fmatcherMaxBiAnchoredWidth != ROSE_BOUND_INF && + length > t->fmatcherMaxBiAnchoredWidth) { + return 0; + } + + if (length < t->fmatcherMinWidth) { + return 0; + } + + const u8 *buffer = scratch->core_info.buf; + size_t flen = length; + if (t->floatingDistance != ROSE_BOUND_INF) { + flen = MIN(t->floatingDistance, length); + } + if (flen <= t->floatingMinDistance) { + return 0; + } + + DEBUG_PRINTF("BEGIN FLOATING (over %zu/%zu)\n", flen, length); + DEBUG_PRINTF("-- %016llx\n", tctxt->groups); + hwlmExec(ftable, buffer, flen, t->floatingMinDistance, roseFloatingCallback, + scratch, tctxt->groups & t->floating_group_mask); + + return can_stop_matching(scratch); +} + +static rose_inline +void runEagerPrefixesBlock(const struct RoseEngine *t, + struct hs_scratch *scratch) { + if (!t->eagerIterOffset) { + return; + } + + char *state = scratch->core_info.state; + u8 *ara = getActiveLeftArray(t, state); /* indexed by offsets into + * left_table */ + const u32 arCount = t->activeLeftCount; + const u32 qCount = t->queueCount; + const struct LeftNfaInfo *left_table = getLeftTable(t); + const struct mmbit_sparse_iter *it = getByOffset(t, t->eagerIterOffset); + + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + u32 idx = 0; + u32 ri = mmbit_sparse_iter_begin(ara, arCount, &idx, it, si_state); + for (; ri != MMB_INVALID; + ri = mmbit_sparse_iter_next(ara, arCount, ri, &idx, it, si_state)) { + const struct LeftNfaInfo *left = left_table + ri; + u32 qi = ri + t->leftfixBeginQueue; + DEBUG_PRINTF("leftfix %u/%u, maxLag=%u\n", ri, arCount, left->maxLag); + + assert(!fatbit_isset(scratch->aqa, qCount, qi)); + assert(left->eager); + assert(!left->infix); + + struct mq *q = scratch->queues + qi; + const struct NFA *nfa = getNfaByQueue(t, qi); + + if (scratch->core_info.len < nfa->minWidth) { + /* we know that there is not enough data for this to ever match, so + * we can immediately squash/ */ + mmbit_unset(ara, arCount, ri); + scratch->tctxt.groups &= left->squash_mask; + } + + s64a loc = MIN(scratch->core_info.len, EAGER_STOP_OFFSET); + + fatbit_set(scratch->aqa, qCount, qi); + initRoseQueue(t, qi, left, scratch); + + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + pushQueueAt(q, 2, MQE_END, loc); + nfaQueueInitState(nfa, q); + + char alive = nfaQueueExecToMatch(q->nfa, q, loc); + + if (!alive) { + DEBUG_PRINTF("queue %u dead, squashing\n", qi); + mmbit_unset(ara, arCount, ri); + fatbit_unset(scratch->aqa, qCount, qi); + scratch->tctxt.groups &= left->squash_mask; + } else if (q->cur == q->end) { + assert(alive != MO_MATCHES_PENDING); + if (loc == (s64a)scratch->core_info.len) { + /* We know that the prefix does not match in the block so we + * can squash the groups anyway even though it did not die */ + /* TODO: if we knew the minimum lag the leftfix is checked at we + * could make this check tighter */ + DEBUG_PRINTF("queue %u has no match in block, squashing\n", qi); + mmbit_unset(ara, arCount, ri); + 
fatbit_unset(scratch->aqa, qCount, qi); + scratch->tctxt.groups &= left->squash_mask; + } else { + DEBUG_PRINTF("queue %u finished, nfa lives\n", qi); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } + } else { + assert(alive == MO_MATCHES_PENDING); + DEBUG_PRINTF("queue %u unfinished, nfa lives\n", qi); + q->end--; /* remove end item */ + } + } +} + +void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch) { + assert(t); + assert(scratch); + assert(scratch->core_info.buf); + assert(mmbit_sparse_iter_state_size(t->rolesWithStateCount) + < MAX_SPARSE_ITER_STATES); + + // We should not have been called if we've already been told to terminate + // matching. + assert(!told_to_stop_matching(scratch)); + + // If this block is shorter than our minimum width, then no pattern in this + // RoseEngine could match. + /* minWidth checks should have already been performed by the caller */ + assert(scratch->core_info.len >= t->minWidth); + + // Similarly, we may have a maximum width (for engines constructed entirely + // of bi-anchored patterns). + /* This check is now handled by the interpreter */ + assert(t->maxBiAnchoredWidth == ROSE_BOUND_INF + || scratch->core_info.len <= t->maxBiAnchoredWidth); + + const size_t length = scratch->core_info.len; + + // We have optimizations for small block scans: we run a single coalesced + // HWLM scan instead of running the anchored and floating matchers. Some + // outfixes are disabled as well (for SEP scans of single-byte literals, + // which are also run in the HWLM scan). + const char is_small_block = + (length < ROSE_SMALL_BLOCK_LEN && t->sbmatcherOffset); + + char *state = scratch->core_info.state; + + init_for_block(t, scratch, state, is_small_block); + + struct RoseContext *tctxt = &scratch->tctxt; + + if (is_small_block) { + const void *sbtable = getSBLiteralMatcher(t); + assert(sbtable); + + size_t sblen = MIN(length, t->smallBlockDistance); + + DEBUG_PRINTF("BEGIN SMALL BLOCK (over %zu/%zu)\n", sblen, length); + DEBUG_PRINTF("-- %016llx\n", tctxt->groups); + hwlmExec(sbtable, scratch->core_info.buf, sblen, 0, roseCallback, + scratch, tctxt->groups); + } else { + runEagerPrefixesBlock(t, scratch); + + if (roseBlockAnchored(t, scratch)) { + return; + } + if (roseBlockFloating(t, scratch)) { + return; + } + } + + if (cleanUpDelayed(t, scratch, length, 0) == HWLM_TERMINATE_MATCHING) { + return; + } + + assert(!can_stop_matching(scratch)); + + roseCatchUpTo(t, scratch, length); + + if (!t->requiresEodCheck || !t->eodProgramOffset) { + DEBUG_PRINTF("no eod check required\n"); + return; + } + + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("bailing, already halted\n"); + return; + } + + roseBlockEodExec(t, length, scratch); +} diff --git a/regex/rose/catchup.c b/regex/rose/catchup.c new file mode 100644 index 000000000..7a6648da9 --- /dev/null +++ b/regex/rose/catchup.c @@ -0,0 +1,900 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Rose runtime: code for catching up output-exposed engines. + */ + +#include "catchup.h" +#include "match.h" +#include "program_runtime.h" +#include "rose.h" +#include "nfa/nfa_rev_api.h" +#include "nfa/mpv.h" +#include "som/som_runtime.h" +#include "util/fatbit.h" +#include "report.h" + +typedef struct queue_match PQ_T; +#define PQ_COMP(pqc_items, a, b) ((pqc_items)[a].loc < (pqc_items)[b].loc) +#define PQ_COMP_B(pqc_items, a, b_fixed) ((pqc_items)[a].loc < (b_fixed).loc) + +#include "util/pqueue.h" + +static really_inline +int roseNfaRunProgram(const struct RoseEngine *rose, struct hs_scratch *scratch, + u64a som, u64a offset, ReportID id, const char from_mpv) { + const u32 program = id; + u8 flags = ROSE_PROG_FLAG_IN_CATCHUP; + if (from_mpv) { + flags |= ROSE_PROG_FLAG_FROM_MPV; + } + + roseRunProgram(rose, scratch, program, som, offset, flags); + + return can_stop_matching(scratch) ? MO_HALT_MATCHING : MO_CONTINUE_MATCHING; +} + +static rose_inline +char roseSuffixInfoIsExhausted(const struct RoseEngine *rose, + const struct NfaInfo *info, + const char *exhausted) { + if (!info->ekeyListOffset) { + return 0; + } + + DEBUG_PRINTF("check exhaustion -> start at %u\n", info->ekeyListOffset); + + /* INVALID_EKEY terminated list */ + const u32 *ekeys = getByOffset(rose, info->ekeyListOffset); + while (*ekeys != INVALID_EKEY) { + DEBUG_PRINTF("check %u\n", *ekeys); + if (!isExhausted(rose, exhausted, *ekeys)) { + DEBUG_PRINTF("not exhausted -> alive\n"); + return 0; + } + ++ekeys; + } + + DEBUG_PRINTF("all ekeys exhausted -> dead\n"); + return 1; +} + +static really_inline +char roseSuffixIsExhausted(const struct RoseEngine *rose, u32 qi, + const char *exhausted) { + DEBUG_PRINTF("check queue %u\n", qi); + const struct NfaInfo *info = getNfaInfoByQueue(rose, qi); + return roseSuffixInfoIsExhausted(rose, info, exhausted); +} + +static really_inline +void deactivateQueue(const struct RoseEngine *t, u8 *aa, u32 qi, + struct hs_scratch *scratch) { + u32 aaCount = t->activeArrayCount; + u32 qCount = t->queueCount; + + /* this is sailing close to the wind with regards to invalidating an + * iteration. 
We are saved by the fact that unsetting does not clear the + * summary bits -> the block under the gun remains valid + */ + DEBUG_PRINTF("killing off zombie queue %u\n", qi); + mmbit_unset(aa, aaCount, qi); + fatbit_unset(scratch->aqa, qCount, qi); +} + +static really_inline +void ensureQueueActive(const struct RoseEngine *t, u32 qi, u32 qCount, + struct mq *q, struct hs_scratch *scratch) { + if (!fatbit_set(scratch->aqa, qCount, qi)) { + DEBUG_PRINTF("initing %u\n", qi); + initQueue(q, qi, t, scratch); + loadStreamState(q->nfa, q, 0); + pushQueueAt(q, 0, MQE_START, 0); + } +} + +static really_inline +void pq_replace_top_with(struct catchup_pq *pq, + UNUSED struct hs_scratch *scratch, u32 queue, + s64a loc) { + DEBUG_PRINTF("inserting q%u in pq at %lld\n", queue, loc); + struct queue_match temp = { + .queue = queue, + .loc = (size_t)loc + }; + + assert(loc > 0); + assert(pq->qm_size); + assert(loc <= (s64a)scratch->core_info.len); + pq_replace_top(pq->qm, pq->qm_size, temp); +} + +static really_inline +void pq_insert_with(struct catchup_pq *pq, + UNUSED struct hs_scratch *scratch, u32 queue, s64a loc) { + DEBUG_PRINTF("inserting q%u in pq at %lld\n", queue, loc); + struct queue_match temp = { + .queue = queue, + .loc = (size_t)loc + }; + + assert(loc > 0); + assert(loc <= (s64a)scratch->core_info.len); + pq_insert(pq->qm, pq->qm_size, temp); + ++pq->qm_size; +} + +static really_inline +void pq_pop_nice(struct catchup_pq *pq) { + pq_pop(pq->qm, pq->qm_size); + pq->qm_size--; +} + +static really_inline +s64a pq_top_loc(struct catchup_pq *pq) { + assert(pq->qm_size); + return (s64a)pq_top(pq->qm)->loc; +} + +/* requires that we are the top item on the pq */ +static really_inline +hwlmcb_rv_t runExistingNfaToNextMatch(const struct RoseEngine *t, u32 qi, + struct mq *q, s64a loc, + struct hs_scratch *scratch, u8 *aa, + char report_curr) { + assert(pq_top(scratch->catchup_pq.qm)->queue == qi); + assert(scratch->catchup_pq.qm_size); + assert(!q->report_current); + if (report_curr) { + DEBUG_PRINTF("need to report matches\n"); + q->report_current = 1; + } + + DEBUG_PRINTF("running queue from %u:%lld to %lld\n", q->cur, q_cur_loc(q), + loc); + + assert(q_cur_loc(q) <= loc); + + char alive = nfaQueueExecToMatch(q->nfa, q, loc); + + /* exit via gift shop */ + if (alive == MO_MATCHES_PENDING) { + /* we have pending matches */ + assert(q_cur_loc(q) + scratch->core_info.buf_offset + >= scratch->tctxt.minMatchOffset); + pq_replace_top_with(&scratch->catchup_pq, scratch, qi, q_cur_loc(q)); + return HWLM_CONTINUE_MATCHING; + } else if (!alive) { + if (report_curr && can_stop_matching(scratch)) { + DEBUG_PRINTF("bailing\n"); + return HWLM_TERMINATE_MATCHING; + } + + deactivateQueue(t, aa, qi, scratch); + } else if (q->cur == q->end) { + DEBUG_PRINTF("queue %u finished, nfa lives\n", qi); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + DEBUG_PRINTF("queue %u unfinished, nfa lives\n", qi); + u32 i = 0; + while (q->cur < q->end) { + q->items[i] = q->items[q->cur++]; + DEBUG_PRINTF("q[%u] = %u:%lld\n", i, q->items[i].type, + q->items[i].location); + assert(q->items[i].type != MQE_END); + i++; + } + q->cur = 0; + q->end = i; + } + + pq_pop_nice(&scratch->catchup_pq); + + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +hwlmcb_rv_t runNewNfaToNextMatch(const struct RoseEngine *t, u32 qi, + struct mq *q, s64a loc, + struct hs_scratch *scratch, u8 *aa, + s64a report_ok_loc) { + assert(!q->report_current); + DEBUG_PRINTF("running queue from %u:%lld to %lld\n", q->cur, q_cur_loc(q), 
+ loc); + DEBUG_PRINTF("min match offset %llu\n", scratch->tctxt.minMatchOffset); + + char alive = 1; + +restart: + alive = nfaQueueExecToMatch(q->nfa, q, loc); + + if (alive == MO_MATCHES_PENDING) { + DEBUG_PRINTF("we have pending matches at %lld\n", q_cur_loc(q)); + s64a qcl = q_cur_loc(q); + + if (qcl == report_ok_loc) { + assert(q->cur != q->end); /* the queue shouldn't be empty if there + * are pending matches. */ + q->report_current = 1; + DEBUG_PRINTF("restarting...\n"); + goto restart; + } + assert(qcl + scratch->core_info.buf_offset + >= scratch->tctxt.minMatchOffset); + pq_insert_with(&scratch->catchup_pq, scratch, qi, qcl); + } else if (!alive) { + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("bailing\n"); + return HWLM_TERMINATE_MATCHING; + } + + deactivateQueue(t, aa, qi, scratch); + } else if (q->cur == q->end) { + DEBUG_PRINTF("queue %u finished, nfa lives\n", qi); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + DEBUG_PRINTF("queue %u unfinished, nfa lives\n", qi); + u32 i = 0; + while (q->cur < q->end) { + q->items[i] = q->items[q->cur++]; + DEBUG_PRINTF("q[%u] = %u:%lld\n", i, q->items[i].type, + q->items[i].location); + assert(q->items[i].type != MQE_END); + i++; + } + q->cur = 0; + q->end = i; + } + + return HWLM_CONTINUE_MATCHING; +} + +/* for use by mpv (chained) only */ +static +int roseNfaFinalBlastAdaptor(u64a start, u64a end, ReportID id, void *context) { + struct hs_scratch *scratch = context; + assert(scratch && scratch->magic == SCRATCH_MAGIC); + const struct RoseEngine *t = scratch->core_info.rose; + + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); + + int cb_rv = roseNfaRunProgram(t, scratch, start, end, id, 1); + if (cb_rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } else if (cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return MO_CONTINUE_MATCHING; + } else { + assert(cb_rv == MO_CONTINUE_MATCHING); + return !roseSuffixIsExhausted(t, 0, + scratch->core_info.exhaustionVector); + } +} + +static really_inline +void ensureEnd(struct mq *q, UNUSED u32 qi, s64a final_loc) { + DEBUG_PRINTF("ensure MQE_END %lld for queue %u\n", final_loc, qi); + if (final_loc >= q_last_loc(q)) { + /* TODO: ensure situation does not arise */ + assert(q_last_type(q) != MQE_END); + pushQueueNoMerge(q, MQE_END, final_loc); + } +} + +static really_inline +hwlmcb_rv_t add_to_queue(const struct RoseEngine *t, struct mq *queues, + u32 qCount, u8 *aa, struct hs_scratch *scratch, + s64a loc, u32 qi, s64a report_ok_loc) { + struct mq *q = queues + qi; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + + if (roseSuffixInfoIsExhausted(t, info, + scratch->core_info.exhaustionVector)) { + deactivateQueue(t, aa, qi, scratch); + return HWLM_CONTINUE_MATCHING; + } + + ensureQueueActive(t, qi, qCount, q, scratch); + + if (unlikely(loc < q_cur_loc(q))) { + DEBUG_PRINTF("err loc %lld < location %lld\n", loc, q_cur_loc(q)); + return HWLM_CONTINUE_MATCHING; + } + + ensureEnd(q, qi, loc); + + return runNewNfaToNextMatch(t, qi, q, loc, scratch, aa, report_ok_loc); +} + +static really_inline +s64a findSecondPlace(struct catchup_pq *pq, s64a loc_limit) { + assert(pq->qm_size); /* we are still on the pq and we are first place */ + + /* we know (*cough* encapsulation) that second place will either be in + * pq->qm[1] or pq->qm[2] (we are pq->qm[0]) */ + switch (pq->qm_size) { + case 0: + case 1: + return (s64a)loc_limit; + case 2: + return MIN((s64a)pq->qm[1].loc, loc_limit); + default:; + size_t best = MIN(pq->qm[1].loc, pq->qm[2].loc); + return 
MIN((s64a)best, loc_limit); + } +} + +hwlmcb_rv_t roseCatchUpMPV_i(const struct RoseEngine *t, s64a loc, + struct hs_scratch *scratch) { + char *state = scratch->core_info.state; + struct mq *queues = scratch->queues; + u8 *aa = getActiveLeafArray(t, state); + UNUSED u32 aaCount = t->activeArrayCount; + u32 qCount = t->queueCount; + + /* find first match of each pending nfa */ + DEBUG_PRINTF("aa=%p, aaCount=%u\n", aa, aaCount); + + assert(t->outfixBeginQueue == 1); + + u32 qi = 0; + assert(mmbit_isset(aa, aaCount, 0)); /* caller should have already bailed */ + + DEBUG_PRINTF("catching up qi=%u to loc %lld\n", qi, loc); + + struct mq *q = queues + qi; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + u64a mpv_exec_end = scratch->core_info.buf_offset + loc; + u64a next_pos_match_loc = 0; + + if (roseSuffixInfoIsExhausted(t, info, + scratch->core_info.exhaustionVector)) { + deactivateQueue(t, aa, qi, scratch); + goto done; + } + + ensureQueueActive(t, qi, qCount, q, scratch); + + if (unlikely(loc < q_cur_loc(q))) { + DEBUG_PRINTF("err loc %lld < location %lld\n", loc, q_cur_loc(q)); + goto done; + } + + ensureEnd(q, qi, loc); + + assert(!q->report_current); + + q->cb = roseNfaFinalBlastAdaptor; + + DEBUG_PRINTF("queue %u blasting, %u/%u [%lld/%lld]\n", + qi, q->cur, q->end, q->items[q->cur].location, loc); + + scratch->tctxt.mpv_inactive = 0; + + /* we know it is going to be an mpv, skip the indirection */ + next_pos_match_loc = nfaExecMpv_QueueExecRaw(q->nfa, q, loc); + assert(!q->report_current); + + if (!next_pos_match_loc) { /* 0 means dead */ + DEBUG_PRINTF("mpv is pining for the fjords\n"); + if (can_stop_matching(scratch)) { + deactivateQueue(t, aa, qi, scratch); + return HWLM_TERMINATE_MATCHING; + } + + next_pos_match_loc = scratch->core_info.len; + scratch->tctxt.mpv_inactive = 1; + } + + if (q->cur == q->end) { + DEBUG_PRINTF("queue %u finished, nfa lives [%lld]\n", qi, loc); + q->cur = 0; + q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + DEBUG_PRINTF("queue %u not finished, nfa lives [%lld]\n", qi, loc); + } + +done: + if (t->flushCombProgramOffset) { + if (roseRunFlushCombProgram(t, scratch, mpv_exec_end) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + updateMinMatchOffsetFromMpv(&scratch->tctxt, mpv_exec_end); + scratch->tctxt.next_mpv_offset + = MAX(next_pos_match_loc + scratch->core_info.buf_offset, + mpv_exec_end + 1); + + DEBUG_PRINTF("next match loc %lld (off %llu)\n", next_pos_match_loc, + scratch->tctxt.next_mpv_offset); + return can_stop_matching(scratch) ? 
HWLM_TERMINATE_MATCHING + : HWLM_CONTINUE_MATCHING; +} + +static really_inline +char in_mpv(const struct RoseEngine *rose, const struct hs_scratch *scratch) { + const struct RoseContext *tctxt = &scratch->tctxt; + assert(tctxt->curr_qi < rose->queueCount); + if (tctxt->curr_qi < rose->outfixBeginQueue) { + assert(getNfaByQueue(rose, tctxt->curr_qi)->type == MPV_NFA); + return 1; + } + return 0; +} + +static +int roseNfaBlastAdaptor(u64a start, u64a end, ReportID id, void *context) { + struct hs_scratch *scratch = context; + assert(scratch && scratch->magic == SCRATCH_MAGIC); + const struct RoseEngine *t = scratch->core_info.rose; + + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); + + const char from_mpv = in_mpv(t, scratch); + int cb_rv = roseNfaRunProgram(t, scratch, start, end, id, from_mpv); + if (cb_rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } else if (cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return MO_CONTINUE_MATCHING; + } else { + assert(cb_rv == MO_CONTINUE_MATCHING); + return !roseSuffixIsExhausted(t, scratch->tctxt.curr_qi, + scratch->core_info.exhaustionVector); + } +} + +int roseNfaAdaptor(u64a start, u64a end, ReportID id, void *context) { + struct hs_scratch *scratch = context; + assert(scratch && scratch->magic == SCRATCH_MAGIC); + + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); + + /* must be a external report as haig cannot directly participate in chain */ + return roseNfaRunProgram(scratch->core_info.rose, scratch, start, end, id, + 0); +} + +static really_inline +char blast_queue(struct hs_scratch *scratch, struct mq *q, u32 qi, s64a to_loc, + char report_current) { + scratch->tctxt.curr_qi = qi; + q->cb = roseNfaBlastAdaptor; + q->report_current = report_current; + DEBUG_PRINTF("queue %u blasting, %u/%u [%lld/%lld]\n", qi, q->cur, q->end, + q_cur_loc(q), to_loc); + char alive = nfaQueueExec(q->nfa, q, to_loc); + q->cb = roseNfaAdaptor; + assert(!q->report_current); + + return alive; +} + +static really_inline +hwlmcb_rv_t buildSufPQ_final(const struct RoseEngine *t, s64a report_ok_loc, + s64a second_place_loc, s64a final_loc, + struct hs_scratch *scratch, u8 *aa, u32 a_qi) { + struct mq *q = scratch->queues + a_qi; + const struct NfaInfo *info = getNfaInfoByQueue(t, a_qi); + DEBUG_PRINTF("blasting qi=%u to %lld [final %lld]\n", a_qi, second_place_loc, + final_loc); + + if (roseSuffixInfoIsExhausted(t, info, + scratch->core_info.exhaustionVector)) { + deactivateQueue(t, aa, a_qi, scratch); + return HWLM_CONTINUE_MATCHING; + } + + ensureQueueActive(t, a_qi, t->queueCount, q, scratch); + + if (unlikely(final_loc < q_cur_loc(q))) { + DEBUG_PRINTF("err loc %lld < location %lld\n", final_loc, q_cur_loc(q)); + return HWLM_CONTINUE_MATCHING; + } + + ensureEnd(q, a_qi, final_loc); + + char alive = blast_queue(scratch, q, a_qi, second_place_loc, 0); + + /* We have three possible outcomes: + * (1) the nfa died + * (2) we completed the queue (implies that second_place_loc == final_loc) + * (3) the queue ran to second_place_loc and stopped. In this case we need + * to find the next match location. 
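+     *
+     * The branches below mirror these outcomes: (1) deactivates the queue
+     * (after first checking whether we have been asked to stop matching
+     * altogether), (2) resets the queue to a bare MQE_START at final_loc,
+     * and (3) re-runs the NFA via runNewNfaToNextMatch() so that it is
+     * re-inserted into the catchup pq at its next match location.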
+ */ + + if (!alive) { + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("roseCatchUpNfas done as bailing\n"); + return HWLM_TERMINATE_MATCHING; + } + + deactivateQueue(t, aa, a_qi, scratch); + } else if (q->cur == q->end) { + DEBUG_PRINTF("queue %u finished, nfa lives [%lld]\n", a_qi, final_loc); + + assert(second_place_loc == final_loc); + + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, final_loc); + } else { + DEBUG_PRINTF("queue %u not finished, %u/%u [%lld/%lld]\n", a_qi, q->cur, + q->end, q_cur_loc(q), final_loc); + DEBUG_PRINTF("finding next match location\n"); + + assert(second_place_loc < final_loc); + assert(q_cur_loc(q) >= second_place_loc); + + if (runNewNfaToNextMatch(t, a_qi, q, final_loc, scratch, aa, + report_ok_loc) == HWLM_TERMINATE_MATCHING) { + DEBUG_PRINTF("roseCatchUpNfas done\n"); + return HWLM_TERMINATE_MATCHING; + } + } + + return HWLM_CONTINUE_MATCHING; +} + +void streamInitSufPQ(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch) { + assert(scratch->catchup_pq.qm_size == 0); + assert(t->outfixBeginQueue != t->outfixEndQueue); + + DEBUG_PRINTF("initSufPQ: outfixes [%u,%u)\n", t->outfixBeginQueue, + t->outfixEndQueue); + + u32 qCount = t->queueCount; + u8 *aa = getActiveLeafArray(t, state); + u32 aaCount = t->activeArrayCount; + struct mq *queues = scratch->queues; + size_t length = scratch->core_info.len; + + u32 qi = mmbit_iterate_bounded(aa, aaCount, t->outfixBeginQueue, + t->outfixEndQueue); + for (; qi < t->outfixEndQueue;) { + DEBUG_PRINTF("adding qi=%u\n", qi); + struct mq *q = queues + qi; + + ensureQueueActive(t, qi, qCount, q, scratch); + ensureEnd(q, qi, length); + + char alive = nfaQueueExecToMatch(q->nfa, q, length); + + if (alive == MO_MATCHES_PENDING) { + DEBUG_PRINTF("we have pending matches at %lld\n", q_cur_loc(q)); + s64a qcl = q_cur_loc(q); + + pq_insert_with(&scratch->catchup_pq, scratch, qi, qcl); + } else if (!alive) { + deactivateQueue(t, aa, qi, scratch); + } else { + assert(q->cur == q->end); + /* TODO: can this be simplified? the nfa will never produce any + * matches for this block. 
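+             * For now we simply reset the queue to a lone MQE_START at the
+             * end of the scanned data so that the engine state is ready for
+             * the next stream write.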
*/ + DEBUG_PRINTF("queue %u finished, nfa lives\n", qi); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, length); + } + + qi = mmbit_iterate_bounded(aa, aaCount, qi + 1, t->outfixEndQueue); + } +} + +void blockInitSufPQ(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch, char is_small_block) { + DEBUG_PRINTF("initSufPQ: outfixes [%u,%u)\n", t->outfixBeginQueue, + t->outfixEndQueue); + + assert(scratch->catchup_pq.qm_size == 0); + assert(t->outfixBeginQueue != t->outfixEndQueue); + + struct mq *queues = scratch->queues; + u8 *aa = getActiveLeafArray(t, state); + struct fatbit *aqa = scratch->aqa; + u32 aaCount = t->activeArrayCount; + u32 qCount = t->queueCount; + size_t length = scratch->core_info.len; + + for (u32 qi = t->outfixBeginQueue; qi < t->outfixEndQueue; qi++) { + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + + if (is_small_block && info->in_sbmatcher) { + DEBUG_PRINTF("skip outfix %u as it's in the SB matcher\n", qi); + continue; + } + + const struct NFA *nfa = getNfaByInfo(t, info); + DEBUG_PRINTF("testing minwidth %u > len %zu\n", nfa->minWidth, + length); + size_t len = nfaRevAccelCheck(nfa, scratch->core_info.buf, length); + if (!len) { + continue; + } + mmbit_set(aa, aaCount, qi); + fatbit_set(aqa, qCount, qi); + struct mq *q = queues + qi; + initQueue(q, qi, t, scratch); + q->length = len; /* adjust for rev_accel */ + nfaQueueInitState(nfa, q); + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + pushQueueAt(q, 2, MQE_END, length); + + DEBUG_PRINTF("adding qi=%u to pq\n", qi); + + char alive = nfaQueueExecToMatch(q->nfa, q, length); + + if (alive == MO_MATCHES_PENDING) { + DEBUG_PRINTF("we have pending matches at %lld\n", q_cur_loc(q)); + s64a qcl = q_cur_loc(q); + + pq_insert_with(&scratch->catchup_pq, scratch, qi, qcl); + } else if (!alive) { + deactivateQueue(t, aa, qi, scratch); + } else { + assert(q->cur == q->end); + /* TODO: can this be simplified? the nfa will never produce any + * matches for this block. */ + DEBUG_PRINTF("queue %u finished, nfa lives\n", qi); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, length); + } + } +} + +/** + * safe_loc is ??? + */ +static rose_inline +hwlmcb_rv_t buildSufPQ(const struct RoseEngine *t, char *state, s64a safe_loc, + s64a final_loc, struct hs_scratch *scratch) { + assert(scratch->catchup_pq.qm_size <= t->outfixEndQueue); + + struct RoseContext *tctxt = &scratch->tctxt; + assert(t->activeArrayCount); + + assert(scratch->core_info.buf_offset + final_loc + > tctxt->minNonMpvMatchOffset); + DEBUG_PRINTF("buildSufPQ final loc %lld (safe %lld)\n", final_loc, + safe_loc); + assert(safe_loc <= final_loc); + + u8 *aa = getActiveLeafArray(t, state); + u32 aaCount = t->activeArrayCount; + + /* find first match of each pending nfa */ + DEBUG_PRINTF("aa=%p, aaCount=%u\n", aa, aaCount); + + /* Note: mpv MUST not participate in the main priority queue as + * they may have events pushed on during this process which may be before + * the catch up point. Outfixes are remain in the pq between catchup events + * as they never have any incoming events to worry about. 
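+     *
+     * The mpv itself is caught up separately (via the roseCatchUpMPV() call
+     * below) before any suffix/outfix queues are added to the pq.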
+ */ + if (aaCount == t->outfixEndQueue) { + return HWLM_CONTINUE_MATCHING; + } + + DEBUG_PRINTF("mib %u/%u\n", t->outfixBeginQueue, aaCount); + + u32 a_qi = mmbit_iterate_bounded(aa, aaCount, t->outfixEndQueue, aaCount); + + if (a_qi == MMB_INVALID) { + return HWLM_CONTINUE_MATCHING; + } + + s64a report_ok_loc = tctxt->minNonMpvMatchOffset + 1 + - scratch->core_info.buf_offset; + + hwlmcb_rv_t rv = roseCatchUpMPV(t, report_ok_loc, scratch); + if (rv != HWLM_CONTINUE_MATCHING) { + DEBUG_PRINTF("terminating...\n"); + return rv; + } + + while (a_qi != MMB_INVALID) { + DEBUG_PRINTF("catching up qi=%u to %lld\n", a_qi, final_loc); + u32 n_qi = mmbit_iterate(aa, aaCount, a_qi); + + s64a second_place_loc + = scratch->catchup_pq.qm_size ? pq_top_loc(&scratch->catchup_pq) + : safe_loc; + second_place_loc = MIN(second_place_loc, safe_loc); + if (n_qi == MMB_INVALID && report_ok_loc <= second_place_loc) { + if (buildSufPQ_final(t, report_ok_loc, second_place_loc, final_loc, + scratch, aa, a_qi) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + break; + } + + if (add_to_queue(t, scratch->queues, t->queueCount, aa, scratch, + final_loc, a_qi, report_ok_loc) + == HWLM_TERMINATE_MATCHING) { + DEBUG_PRINTF("roseCatchUpNfas done\n"); + return HWLM_TERMINATE_MATCHING; + } + + a_qi = n_qi; + } + + DEBUG_PRINTF("PQ BUILD %u items\n", scratch->catchup_pq.qm_size); + return HWLM_CONTINUE_MATCHING; +} + +static never_inline +hwlmcb_rv_t roseCatchUpNfas(const struct RoseEngine *t, s64a loc, + s64a final_loc, struct hs_scratch *scratch) { + assert(t->activeArrayCount); + + DEBUG_PRINTF("roseCatchUpNfas offset=%llu + %lld/%lld\n", + scratch->core_info.buf_offset, loc, final_loc); + DEBUG_PRINTF("min non mpv match offset %llu\n", + scratch->tctxt.minNonMpvMatchOffset); + + struct RoseContext *tctxt = &scratch->tctxt; + assert(scratch->core_info.buf_offset + loc >= tctxt->minNonMpvMatchOffset); + + char *state = scratch->core_info.state; + struct mq *queues = scratch->queues; + u8 *aa = getActiveLeafArray(t, state); + + /* fire off earliest nfa match and catchup anchored matches to that point */ + while (scratch->catchup_pq.qm_size) { + s64a match_loc = pq_top_loc(&scratch->catchup_pq); + u32 qi = pq_top(scratch->catchup_pq.qm)->queue; + + DEBUG_PRINTF("winrar q%u@%lld loc %lld\n", qi, match_loc, loc); + assert(match_loc + scratch->core_info.buf_offset + >= scratch->tctxt.minNonMpvMatchOffset); + + if (match_loc > loc) { + /* we have processed all the matches at or before rose's current + * location; only things remaining on the pq should be outfixes. */ + DEBUG_PRINTF("saving for later\n"); + goto exit; + } + + /* catch up char matches to this point */ + if (roseCatchUpMPV(t, match_loc, scratch) + == HWLM_TERMINATE_MATCHING) { + DEBUG_PRINTF("roseCatchUpNfas done\n"); + return HWLM_TERMINATE_MATCHING; + } + + assert(match_loc + scratch->core_info.buf_offset + >= scratch->tctxt.minNonMpvMatchOffset); + + struct mq *q = queues + qi; + + /* outfixes must be advanced all the way as they persist in the pq + * between catchup events */ + s64a q_final_loc = qi >= t->outfixEndQueue ? final_loc + : (s64a)scratch->core_info.len; + + /* fire nfa matches, and find next place this nfa match */ + DEBUG_PRINTF("reporting matches %u@%llu [q->cur %u/%u]\n", qi, + match_loc, q->cur, q->end); + + /* we then need to catch this nfa up to next earliest nfa match. These + * matches can be fired directly from the callback. 
The callback needs + * to ensure that the anchored matches remain in sync though */ + s64a second_place_loc = findSecondPlace(&scratch->catchup_pq, loc); + DEBUG_PRINTF("second place %lld loc %lld\n", second_place_loc, loc); + + if (second_place_loc == q_cur_loc(q)) { + if (runExistingNfaToNextMatch(t, qi, q, q_final_loc, scratch, aa, 1) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + continue; + } + + char alive = blast_queue(scratch, q, qi, second_place_loc, 1); + + if (!alive) { + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("roseCatchUpNfas done as bailing\n"); + return HWLM_TERMINATE_MATCHING; + } + + deactivateQueue(t, aa, qi, scratch); + pq_pop_nice(&scratch->catchup_pq); + } else if (q->cur == q->end) { + DEBUG_PRINTF("queue %u finished, nfa lives [%lld]\n", qi, loc); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + pq_pop_nice(&scratch->catchup_pq); + } else if (second_place_loc == q_final_loc) { + DEBUG_PRINTF("queue %u on hold\n", qi); + pq_pop_nice(&scratch->catchup_pq); + break; + } else { + DEBUG_PRINTF("queue %u not finished, %u/%u [%lld/%lld]\n", + qi, q->cur, q->end, q->items[q->cur].location, loc); + runExistingNfaToNextMatch(t, qi, q, q_final_loc, scratch, aa, 0); + } + } +exit:; + tctxt->minNonMpvMatchOffset = scratch->core_info.buf_offset + loc; + DEBUG_PRINTF("roseCatchUpNfas done\n"); + return HWLM_CONTINUE_MATCHING; +} + +hwlmcb_rv_t roseCatchUpAll(s64a loc, struct hs_scratch *scratch) { + /* just need suf/outfixes and mpv */ + DEBUG_PRINTF("loc %lld mnmmo %llu mmo %llu\n", loc, + scratch->tctxt.minNonMpvMatchOffset, + scratch->tctxt.minMatchOffset); + assert(scratch->core_info.buf_offset + loc + > scratch->tctxt.minNonMpvMatchOffset); + + const struct RoseEngine *t = scratch->core_info.rose; + char *state = scratch->core_info.state; + + hwlmcb_rv_t rv = buildSufPQ(t, state, loc, loc, scratch); + if (rv != HWLM_CONTINUE_MATCHING) { + return rv; + } + + rv = roseCatchUpNfas(t, loc, loc, scratch); + if (rv != HWLM_CONTINUE_MATCHING) { + return rv; + } + + rv = roseCatchUpMPV(t, loc, scratch); + assert(rv != HWLM_CONTINUE_MATCHING + || scratch->catchup_pq.qm_size <= t->outfixEndQueue); + assert(!can_stop_matching(scratch) || rv == HWLM_TERMINATE_MATCHING); + return rv; +} + +hwlmcb_rv_t roseCatchUpSuf(s64a loc, struct hs_scratch *scratch) { + /* just need suf/outfixes. mpv will be caught up only to last reported + * external match */ + assert(scratch->core_info.buf_offset + loc + > scratch->tctxt.minNonMpvMatchOffset); + + const struct RoseEngine *t = scratch->core_info.rose; + char *state = scratch->core_info.state; + + hwlmcb_rv_t rv = buildSufPQ(t, state, loc, loc, scratch); + if (rv != HWLM_CONTINUE_MATCHING) { + return rv; + } + + rv = roseCatchUpNfas(t, loc, loc, scratch); + assert(rv != HWLM_CONTINUE_MATCHING || + scratch->catchup_pq.qm_size <= t->outfixEndQueue); + + return rv; +} diff --git a/regex/rose/catchup.h b/regex/rose/catchup.h new file mode 100644 index 000000000..8188d5af0 --- /dev/null +++ b/regex/rose/catchup.h @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Rose runtime: code for catching up output-exposed engines. + * + * Rose has several components which run behind the main (floating table) clock + * and need to be caught up before we report matches. + * + * Currently we have to deal with: + * 1. Suffix/Outfix NFAs + * 2. A single MPV NFA (chained), which may also be triggered by (1). + * + * The approach is to: + * - (A) build a priority queue of the suffix/outfixes based on their first + * match location; + * - (B) process the matches from the priority queue in order; + * - (C) As we report matches from (B) we interleave matches from the MPV if it + * exists. + */ + +#ifndef ROSE_CATCHUP_H +#define ROSE_CATCHUP_H + +#include "hwlm/hwlm.h" +#include "runtime.h" +#include "scratch.h" +#include "rose.h" +#include "rose_common.h" +#include "rose_internal.h" +#include "ue2common.h" +#include "util/multibit.h" + +hwlmcb_rv_t roseCatchUpAll(s64a loc, struct hs_scratch *scratch); + +/* will only catch mpv up to last reported external match */ +hwlmcb_rv_t roseCatchUpSuf(s64a loc, struct hs_scratch *scratch); + +hwlmcb_rv_t roseCatchUpMPV_i(const struct RoseEngine *t, s64a loc, + struct hs_scratch *scratch); + +void blockInitSufPQ(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch, char is_small_block); +void streamInitSufPQ(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch); + +static really_inline +int canSkipCatchUpMPV(const struct RoseEngine *t, struct hs_scratch *scratch, + u64a cur_offset) { + if (!has_chained_nfas(t)) { + return 1; + } + + /* note: we may have to run at less than tctxt.minMatchOffset as we may + * have a full queue of postponed events that we need to flush */ + if (cur_offset < scratch->tctxt.next_mpv_offset) { + DEBUG_PRINTF("skipping cur_offset %llu min %llu, mpv %llu\n", + cur_offset, scratch->tctxt.minMatchOffset, + scratch->tctxt.next_mpv_offset); + return 1; + } + + assert(t->activeArrayCount); + + DEBUG_PRINTF("cur offset offset: %llu\n", cur_offset); + DEBUG_PRINTF("min match offset %llu\n", scratch->tctxt.minMatchOffset); + + assert(t->outfixBeginQueue == 1); /* if it exists mpv is queue 0 */ + + const u8 *aa = getActiveLeafArray(t, scratch->core_info.state); + return !mmbit_isset(aa, t->activeArrayCount, 0); +} + +/** \brief Catches up the MPV. 
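+ *
+ * If canSkipCatchUpMPV() reports that there is nothing to do, this only runs
+ * the flush combination program (if any) and advances the min match offset;
+ * otherwise it falls through to roseCatchUpMPV_i().
+ *
+ * A minimal call-site sketch (simplified from roseCatchUpTo() below):
+ *
+ *     s64a loc = end - scratch->core_info.buf_offset;
+ *     if (roseCatchUpMPV(t, loc, scratch) == HWLM_TERMINATE_MATCHING)
+ *         return HWLM_TERMINATE_MATCHING;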
*/ +static really_inline +hwlmcb_rv_t roseCatchUpMPV(const struct RoseEngine *t, s64a loc, + struct hs_scratch *scratch) { + u64a cur_offset = loc + scratch->core_info.buf_offset; + assert(cur_offset >= scratch->tctxt.minMatchOffset); + assert(!can_stop_matching(scratch)); + + if (canSkipCatchUpMPV(t, scratch, cur_offset)) { + if (t->flushCombProgramOffset) { + if (roseRunFlushCombProgram(t, scratch, cur_offset) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + updateMinMatchOffsetFromMpv(&scratch->tctxt, cur_offset); + return HWLM_CONTINUE_MATCHING; + } + + /* Note: chained tails MUST not participate in the priority queue as + * they may have events pushed on during this process which may be before + * the catch up point */ + + return roseCatchUpMPV_i(t, loc, scratch); +} + +/** \brief Catches up NFAs and the MPV. */ +static rose_inline +hwlmcb_rv_t roseCatchUpTo(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end) { + /* no need to catch up if we are at the same offset as last time */ + if (end <= scratch->tctxt.minMatchOffset) { + /* we must already be up to date */ + DEBUG_PRINTF("skip\n"); + return HWLM_CONTINUE_MATCHING; + } + + char *state = scratch->core_info.state; + s64a loc = end - scratch->core_info.buf_offset; + + if (end <= scratch->tctxt.minNonMpvMatchOffset) { + /* only need to catch up the mpv */ + return roseCatchUpMPV(t, loc, scratch); + } + + assert(scratch->tctxt.minMatchOffset >= scratch->core_info.buf_offset); + hwlmcb_rv_t rv; + if (!t->activeArrayCount + || !mmbit_any(getActiveLeafArray(t, state), t->activeArrayCount)) { + if (t->flushCombProgramOffset) { + if (roseRunFlushCombProgram(t, scratch, end) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + updateMinMatchOffset(&scratch->tctxt, end); + rv = HWLM_CONTINUE_MATCHING; + } else { + rv = roseCatchUpAll(loc, scratch); + } + + assert(rv != HWLM_CONTINUE_MATCHING + || scratch->tctxt.minMatchOffset == end); + assert(rv != HWLM_CONTINUE_MATCHING + || scratch->tctxt.minNonMpvMatchOffset == end); + assert(!can_stop_matching(scratch) || rv == HWLM_TERMINATE_MATCHING); + return rv; +} + +/** + * \brief Catches up anything which may add triggers on the MPV (suffixes and + * outfixes). + * + * The MPV will be run only to intersperse matches in the output match stream + * if external matches are raised. + */ +static rose_inline +hwlmcb_rv_t roseCatchUpMpvFeeders(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end) { + /* no need to catch up if we are at the same offset as last time */ + if (end <= scratch->tctxt.minNonMpvMatchOffset) { + /* we must already be up to date */ + DEBUG_PRINTF("skip\n"); + return HWLM_CONTINUE_MATCHING; + } + + s64a loc = end - scratch->core_info.buf_offset; + + assert(t->activeArrayCount); /* mpv is in active array */ + assert(scratch->tctxt.minMatchOffset >= scratch->core_info.buf_offset); + + if (!t->mpvTriggeredByLeaf) { + /* no need to check as they never put triggers onto the mpv */ + return HWLM_CONTINUE_MATCHING; + } + + /* sadly, this branch rarely gets taken as the mpv itself is usually + * alive. 
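+     * If no leaf engine is active at all, the code below simply advances
+     * minNonMpvMatchOffset and skips the suffix catch up entirely.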
*/ + char *state = scratch->core_info.state; + if (!mmbit_any(getActiveLeafArray(t, state), t->activeArrayCount)) { + scratch->tctxt.minNonMpvMatchOffset = end; + return HWLM_CONTINUE_MATCHING; + } + + return roseCatchUpSuf(loc, scratch); +} + +#endif diff --git a/regex/rose/counting_miracle.h b/regex/rose/counting_miracle.h new file mode 100644 index 000000000..976208b73 --- /dev/null +++ b/regex/rose/counting_miracle.h @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ROSE_COUNTING_MIRACLE_H +#define ROSE_COUNTING_MIRACLE_H + +#include "ue2common.h" +#include "runtime.h" +#include "rose_internal.h" +#include "nfa/nfa_api_queue.h" +#include "util/simd_utils.h" + +/** \brief Maximum number of bytes to scan when looking for a "counting miracle" + * stop character. 
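+ *
+ * roseCountingMiracleOccurs() below clamps its scan window to at most this
+ * many bytes before the (lag-adjusted) end location.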
*/ +#define COUNTING_MIRACLE_LEN_MAX 256 + +static really_inline +char roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end, + u32 target_count, u32 *count_inout, + const u8 **d_out) { + assert(d <= d_end); + + u32 count = *count_inout; + + m128 chars = set16x8(c); + + for (; d + 16 <= d_end; d_end -= 16) { + m128 data = loadu128(d_end - 16); + u32 z1 = movemask128(eq128(chars, data)); + count += popcount32(z1); + + if (count >= target_count) { + *d_out = d_end - 16; + *count_inout = count; + return 1; + } + } + + if (d != d_end) { + char temp[sizeof(m128)]; + assert(d + sizeof(temp) > d_end); + memset(temp, c + 1, sizeof(temp)); + memcpy(temp, d, d_end - d); + m128 data = loadu128(temp); + u32 z1 = movemask128(eq128(chars, data)); + count += popcount32(z1); + + if (count >= target_count) { + *d_out = d; + *count_inout = count; + return 1; + } + } + + *count_inout = count; + return 0; +} + +#define GET_LO_4(chars) and128(chars, low4bits) +#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) + +static really_inline +u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison, + const u8 *d, const u8 *d_end, + u32 target_count, u32 *count_inout, + const u8 **d_out) { + assert(d <= d_end); + + u32 count = *count_inout; + + const m128 zeroes = zeroes128(); + const m128 low4bits = _mm_set1_epi8(0xf); + + for (; d + 16 <= d_end; d_end -= 16) { + m128 data = loadu128(d_end - 16); + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(data)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(data)); + m128 t = and128(c_lo, c_hi); + u32 z1 = movemask128(eq128(t, zeroes)); + count += popcount32(z1 ^ 0xffff); + + if (count >= target_count) { + *d_out = d_end - 16; + *count_inout = count; + return 1; + } + } + + if (d != d_end) { + char temp[sizeof(m128)]; + assert(d + sizeof(temp) > d_end); + memset(temp, poison, sizeof(temp)); + memcpy(temp, d, d_end - d); + m128 data = loadu128(temp); + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(data)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(data)); + m128 t = and128(c_lo, c_hi); + u32 z1 = movemask128(eq128(t, zeroes)); + count += popcount32(z1 ^ 0xffff); + + if (count >= target_count) { + *d_out = d; + *count_inout = count; + return 1; + } + } + + *count_inout = count; + return 0; +} + +/** + * \brief "Counting Miracle" scan: If we see more than N instances of a + * particular character class we know that the engine must be dead. + * + * Scans the buffer/history between relative locations \a begin_loc and \a + * end_loc, and returns a miracle location (if any) that appears in the stream + * after \a begin_loc. + * + * Returns 1 if some bytes can be skipped and sets \a miracle_loc + * appropriately, 0 otherwise. 
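+ *
+ * Depending on RoseCountingMiracle::shufti, the scan counts either a single
+ * literal byte (roseCountingMiracleScan) or bytes matching a shufti class
+ * (roseCountingMiracleScanShufti), over the buffer and then the history.
+ *
+ * Hypothetical usage sketch (the real callers, which deal with leftfix NFAs,
+ * are outside this hunk):
+ *
+ *     s64a miracle_loc;
+ *     if (roseCountingMiracleOccurs(t, left, ci, begin_loc, end_loc,
+ *                                   &miracle_loc)) {
+ *         begin_loc = miracle_loc; // engine must be dead before this point
+ *     }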
+ */ +static never_inline +int roseCountingMiracleOccurs(const struct RoseEngine *t, + const struct LeftNfaInfo *left, + const struct core_info *ci, s64a begin_loc, + const s64a end_loc, s64a *miracle_loc) { + if (!left->countingMiracleOffset) { + return 0; + } + + const struct RoseCountingMiracle *cm + = (const void *)((const char *)t + left->countingMiracleOffset); + + assert(!left->transient); + assert(cm->count > 1); /* should be a normal miracle then */ + + DEBUG_PRINTF("looking for counting miracle over [%lld,%lld], maxLag=%u\n", + begin_loc, end_loc, left->maxLag); + DEBUG_PRINTF("ci->len=%zu, ci->hlen=%zu\n", ci->len, ci->hlen); + + assert(begin_loc <= end_loc); + assert(begin_loc >= -(s64a)ci->hlen); + assert(end_loc <= (s64a)ci->len); + + const s64a scan_end_loc = end_loc - left->maxLag; + if (scan_end_loc <= begin_loc) { + DEBUG_PRINTF("nothing to scan\n"); + return 0; + } + + const s64a start = MAX(begin_loc, scan_end_loc - COUNTING_MIRACLE_LEN_MAX); + DEBUG_PRINTF("scan [%lld..%lld]\n", start, scan_end_loc); + + u32 count = 0; + + s64a m_loc = start; + + if (!cm->shufti) { + u8 c = cm->c; + + // Scan buffer. + const s64a buf_scan_start = MAX(0, start); + if (scan_end_loc > buf_scan_start) { + const u8 *buf = ci->buf; + const u8 *d = buf + scan_end_loc; + const u8 *d_start = buf + buf_scan_start; + const u8 *d_out; + if (roseCountingMiracleScan(c, d_start, d, cm->count, &count, + &d_out)) { + assert(d_out >= d_start); + m_loc = (d_out - d_start) + buf_scan_start; + goto success; + } + } + + // Scan history. + if (start < 0) { + const u8 *hbuf_end = ci->hbuf + ci->hlen; + const u8 *d = hbuf_end + MIN(0, scan_end_loc); + const u8 *d_start = hbuf_end + start; + const u8 *d_out; + if (roseCountingMiracleScan(c, d_start, d, cm->count, &count, + &d_out)) { + assert(d_out >= d_start); + m_loc = (d_out - d_start) + start; + goto success; + } + } + } else { + m128 lo = cm->lo; + m128 hi = cm->hi; + u8 poison = cm->poison; + + // Scan buffer. + const s64a buf_scan_start = MAX(0, start); + if (scan_end_loc > buf_scan_start) { + const u8 *buf = ci->buf; + const u8 *d = buf + scan_end_loc; + const u8 *d_start = buf + buf_scan_start; + const u8 *d_out; + if (roseCountingMiracleScanShufti(lo, hi, poison, d_start, d, + cm->count, &count, &d_out)) { + assert(d_out >= d_start); + m_loc = (d_out - d_start) + buf_scan_start; + goto success; + } + } + + // Scan history. + if (start < 0) { + const u8 *hbuf_end = ci->hbuf + ci->hlen; + const u8 *d = hbuf_end + MIN(0, scan_end_loc); + const u8 *d_start = hbuf_end + start; + const u8 *d_out; + if (roseCountingMiracleScanShufti(lo, hi, poison, d_start, d, + cm->count, &count, &d_out)) { + assert(d_out >= d_start); + m_loc = (d_out - d_start) + start; + goto success; + } + } + } + + DEBUG_PRINTF("found %u/%u\n", count, cm->count); + return 0; + +success: + DEBUG_PRINTF("found %u/%u\n", count, cm->count); + assert(count >= cm->count); + assert(m_loc < scan_end_loc); + assert(m_loc >= start); + + *miracle_loc = m_loc; + return 1; +} + +#endif diff --git a/regex/rose/infix.h b/regex/rose/infix.h new file mode 100644 index 000000000..9cf9c0ad7 --- /dev/null +++ b/regex/rose/infix.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef INFIX_H +#define INFIX_H + +#include "ue2common.h" +#include "nfa/nfa_api.h" +#include "nfa/nfa_api_queue.h" +#include "nfa/nfa_internal.h" + +static really_inline +int infixTooOld(struct mq *q, s64a curr_loc) { + u32 maxAge = q->nfa->maxWidth; + + if (!maxAge) { + return 0; + } + + return q_last_loc(q) + maxAge < curr_loc; +} + +static really_inline +int canReduceQueue(const struct mq *q, s64a curr_loc, u32 maxTops, u32 maxAge) { + u32 qlen = q->end - q->cur; /* includes MQE_START */ + + if (maxAge && q->items[q->cur].location + maxAge < curr_loc) { + return 1; + } + + if (qlen - 1 > maxTops) { + return 1; + } + + if (qlen - 1 == maxTops + && q->items[q->cur].location != q->items[q->cur + 1].location) { + /* we can advance start to the first top location */ + return 1; + } + + return 0; +} + +/** + * Removes tops which are known not to affect the final state from the queue. + * May also reinitialise the engine state if it is unneeded. + * + * maxAge is the maximum width of the infix. Any tops/state before this can be + * ignored. 0 is used to indicate that there is no upper bound on the width of + * the pattern. + * + * maxTops is the maximum number of locations of tops that can affect the top. + * It is only possible for the last maxTops tops to affect the final state - + * earlier ones can be safely removed. Also, any state before the max tops may + * be ignored. + * + * This code assumes/requires that there are not multiple tops at the same + * location in the queue. This code also assumes that it is not a multitop + * engine. + */ +static really_inline +void reduceInfixQueue(struct mq *q, s64a curr_loc, u32 maxTops, u32 maxAge) { + assert(q->end > q->cur); + assert(maxTops); + u32 qlen = q->end - q->cur; /* includes MQE_START */ + DEBUG_PRINTF("q=%p, len=%u, maxTops=%u maxAge=%u\n", q, qlen, maxTops, + maxAge); + + if (!canReduceQueue(q, curr_loc, maxTops, maxAge)) { + DEBUG_PRINTF("nothing to do\n"); + return; + } + +#ifdef DEBUG + debugQueue(q); +#endif + + char drop_state = qlen - 1 >= maxTops + || (maxAge && q->items[q->cur].location + maxAge < curr_loc); + + LIMIT_TO_AT_MOST(&maxTops, qlen - 1); + + // We leave our START where it is, at the front of the queue. 
+ assert(q->items[q->cur].type == MQE_START); + + // We want to shuffle maxQueueLen items from the end of the queue to just + // after the start, effectively dequeuing old items. We could use memmove + // for this, but it's probably not a good idea to take the cost of the + // function call. + const struct mq_item *src = &q->items[q->cur + qlen - maxTops]; + + q->items[0] = q->items[q->cur]; /* shift start event to 0 slot */ + q->cur = 0; + q->end = 1; + struct mq_item *dst = &q->items[1]; + u32 i = 0; + if (maxAge) { + /* any event which is older than maxAge can be dropped */ + for (; i < maxTops; i++, src++) { + if (src->location >= curr_loc - maxAge) { + break; + } + } + } + + for (; i < maxTops; i++) { + *dst = *src; + src++; + dst++; + q->end++; + } + + if (drop_state) { + /* clear state and shift start up to first top */ + s64a new_loc; + if (q->end > 1) { + new_loc = q->items[1].location; + } else { + DEBUG_PRINTF("no tops\n"); + new_loc = curr_loc; + } + + DEBUG_PRINTF("advancing start from %lld to %lld\n", + q->items[0].location, new_loc); + assert(new_loc > q->items[0].location); + q->items[0].location = new_loc; + nfaQueueInitState(q->nfa, q); + } + + DEBUG_PRINTF("reduced queue to len=%u\n", q->end - q->cur); +#ifdef DEBUG + debugQueue(q); +#endif +} + +#endif diff --git a/regex/rose/init.c b/regex/rose/init.c new file mode 100644 index 000000000..761024d1a --- /dev/null +++ b/regex/rose/init.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "init.h" +#include "match.h" +#include "runtime.h" +#include "scratch.h" +#include "rose.h" +#include "rose_common.h" +#include "rose_internal.h" +#include "ue2common.h" +#include "nfa/mcclellan.h" +#include "nfa/nfa_api_util.h" +#include "nfa/nfa_internal.h" +#include "util/multibit.h" + +#ifndef __KERNEL__ +#include +#else +#include +#endif + +static really_inline +void init_rstate(const struct RoseEngine *t, char *state) { + // Set runtime state: we take our initial groups from the RoseEngine. 
+ DEBUG_PRINTF("setting initial groups to 0x%016llx\n", t->initialGroups); + storeGroups(t, state, t->initialGroups); +} + +static really_inline +void init_outfixes(const struct RoseEngine *t, char *state) { + /* The active leaf array has been init'ed by the scatter with outfix + * bits set on */ + + // Init the NFA state for each outfix. + for (u32 qi = t->outfixBeginQueue; qi < t->outfixEndQueue; qi++) { + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + const struct NFA *nfa = getNfaByInfo(t, info); + nfaInitCompressedState(nfa, 0, state + info->stateOffset, + 0 /* assume NUL at start */); + } + + if (t->initMpvNfa != MO_INVALID_IDX) { + const struct NfaInfo *info = getNfaInfoByQueue(t, t->initMpvNfa); + const struct NFA *nfa = getNfaByInfo(t, info); + nfaInitCompressedState(nfa, 0, state + info->stateOffset, + 0 /* assume NUL at start */); + mmbit_set(getActiveLeafArray(t, state), t->activeArrayCount, + t->initMpvNfa); + } +} + +void roseInitState(const struct RoseEngine *t, char *state) { + assert(t); + assert(state); + + DEBUG_PRINTF("init for Rose %p with %u state indices)\n", t, + t->rolesWithStateCount); + + // Rose is guaranteed 8-aligned state + assert(ISALIGNED_N(state, 8)); + + init_rstate(t, state); + + init_state(t, state); + init_outfixes(t, state); +} diff --git a/regex/rose/init.h b/regex/rose/init.h new file mode 100644 index 000000000..b37053b26 --- /dev/null +++ b/regex/rose/init.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ROSE_INIT_H +#define ROSE_INIT_H + +#include "rose_internal.h" +#include "ue2common.h" +#include "util/scatter_runtime.h" + +/* + * Initialisation code common to streaming mode Rose (init.c) and block mode + * Rose (block.c) code. 
+ */ + +static really_inline +void init_state(const struct RoseEngine *t, char *state) { + scatter(state, t, &t->state_init); +} + +#endif // ROSE_INIT_H diff --git a/regex/rose/match.c b/regex/rose/match.c new file mode 100644 index 000000000..023db3860 --- /dev/null +++ b/regex/rose/match.c @@ -0,0 +1,632 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "catchup.h" +#include "match.h" +#include "program_runtime.h" +#include "rose.h" +#include "util/bitutils.h" +#include "util/fatbit.h" + +#if defined(DEBUG) || defined(DUMP_SUPPORT) +#include "util/compare.h" +/** A debugging crutch: print a hex-escaped version of the match for our + * perusal. The start and end offsets are stream offsets. */ +static UNUSED +void printMatch(const struct core_info *ci, u64a start, u64a end) { + assert(start <= end); + assert(end <= ci->buf_offset + ci->len); + + DEBUG_PRINTF("'"); + u64a i = start; + for (; i <= MIN(ci->buf_offset, end); i++) { + u64a h_idx = ci->buf_offset - i; + u8 c = h_idx >= ci->hlen ? '?' : ci->hbuf[ci->hlen - h_idx - 1]; + if (ourisprint(c) && c != '\'') { + DEBUG_PRINTF("%c", c); + } else { + DEBUG_PRINTF("\\x%02x", c); + } + } + for (; i <= end; i++) { + u64a b_idx = i - ci->buf_offset - 1; + u8 c = b_idx >= ci->len ? '?' : ci->buf[b_idx]; + if (ourisprint(c) && c != '\'') { + DEBUG_PRINTF("%c", c); + } else { + DEBUG_PRINTF("\\x%02x", c); + } + } + DEBUG_PRINTF("'"); +} +#endif + +hwlmcb_rv_t roseDelayRebuildCallback(size_t end, u32 id, + struct hs_scratch *scratch) { + struct RoseContext *tctx = &scratch->tctxt; + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *t = ci->rose; + size_t rb_len = MIN(ci->hlen, t->delayRebuildLength); + + u64a real_end = ci->buf_offset - rb_len + end + 1; // index after last byte + +#ifdef DEBUG + DEBUG_PRINTF("REBUILD MATCH id=%u end offset@%llu]: ", id, real_end); + u64a start = real_end < 8 ? 
1 : real_end - 7; + printMatch(ci, start, real_end); + DEBUG_PRINTF("\n"); +#endif + + DEBUG_PRINTF("STATE groups=0x%016llx\n", tctx->groups); + + assert(id && id < t->size); // id is a program offset + const u64a som = 0; + const u8 flags = 0; + UNUSED hwlmcb_rv_t rv = + roseRunProgram(t, scratch, id, som, real_end, flags); + assert(rv != HWLM_TERMINATE_MATCHING); + + /* we are just repopulating the delay queue, groups should be + * already set from the original scan. */ + + return tctx->groups; +} + +static really_inline +hwlmcb_rv_t ensureMpvQueueFlushed(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 qi, s64a loc, + char in_chained) { + return ensureQueueFlushed_i(t, scratch, qi, loc, 1, in_chained); +} + +hwlmcb_rv_t roseHandleChainMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 event, + u64a top_squash_distance, u64a end, + char in_catchup) { + assert(event == MQE_TOP || event >= MQE_TOP_FIRST); + struct core_info *ci = &scratch->core_info; + + u8 *aa = getActiveLeafArray(t, scratch->core_info.state); + u32 aaCount = t->activeArrayCount; + struct fatbit *activeQueues = scratch->aqa; + u32 qCount = t->queueCount; + + const u32 qi = 0; /* MPV is always queue 0 if it exists */ + struct mq *q = &scratch->queues[qi]; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + + s64a loc = (s64a)end - ci->buf_offset; + assert(loc <= (s64a)ci->len && loc >= -(s64a)ci->hlen); + + if (!mmbit_set(aa, aaCount, qi)) { + initQueue(q, qi, t, scratch); + nfaQueueInitState(q->nfa, q); + pushQueueAt(q, 0, MQE_START, loc); + fatbit_set(activeQueues, qCount, qi); + } else if (info->no_retrigger) { + DEBUG_PRINTF("yawn\n"); + /* nfa only needs one top; we can go home now */ + return HWLM_CONTINUE_MATCHING; + } else if (!fatbit_set(activeQueues, qCount, qi)) { + initQueue(q, qi, t, scratch); + loadStreamState(q->nfa, q, 0); + pushQueueAt(q, 0, MQE_START, 0); + } else if (isQueueFull(q)) { + DEBUG_PRINTF("queue %u full -> catching up nfas\n", qi); + /* we know it is a chained nfa and the suffixes/outfixes must already + * be known to be consistent */ + if (ensureMpvQueueFlushed(t, scratch, qi, loc, in_catchup) + == HWLM_TERMINATE_MATCHING) { + DEBUG_PRINTF("terminating...\n"); + return HWLM_TERMINATE_MATCHING; + } + } + + if (top_squash_distance) { + assert(q->cur < q->end); + struct mq_item *last = &q->items[q->end - 1]; + if (last->type == event + && last->location >= loc - (s64a)top_squash_distance) { + last->location = loc; + goto event_enqueued; + } + } + + pushQueue(q, event, loc); + +event_enqueued: + if (q_cur_loc(q) == (s64a)ci->len) { + /* we may not run the nfa; need to ensure state is fine */ + DEBUG_PRINTF("empty run\n"); + pushQueueNoMerge(q, MQE_END, loc); + char alive = nfaQueueExec(q->nfa, q, loc); + if (alive) { + scratch->tctxt.mpv_inactive = 0; + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + mmbit_unset(aa, aaCount, qi); + fatbit_unset(scratch->aqa, qCount, qi); + } + } + + DEBUG_PRINTF("added mpv event at %lld\n", loc); + scratch->tctxt.next_mpv_offset = 0; /* the top event may result in matches + * earlier than expected */ + return HWLM_CONTINUE_MATCHING; +} + +int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx) { + struct hs_scratch *scratch = ctx; + assert(scratch && scratch->magic == SCRATCH_MAGIC); + struct RoseContext *tctxt = &scratch->tctxt; + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *t = ci->rose; + + u64a real_end = ci->buf_offset + end; // index after last byte + + 
DEBUG_PRINTF("MATCH id=%u offsets=[???,%llu]\n", id, real_end); + DEBUG_PRINTF("STATE groups=0x%016llx\n", tctxt->groups); + + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("received a match when we're already dead!\n"); + return MO_HALT_MATCHING; + } + + /* delayed literals need to be delivered before real literals; however + * delayed literals only come from the floating table so if we are going + * to deliver a literal here it must be too early for a delayed literal */ + + /* no history checks from anchored region and we are before the flush + * boundary */ + + if (real_end <= t->floatingMinLiteralMatchOffset) { + roseFlushLastByteHistory(t, scratch, real_end); + tctxt->lastEndOffset = real_end; + } + + // Note that the "id" we have been handed is the program offset. + const u8 flags = ROSE_PROG_FLAG_IN_ANCHORED; + if (roseRunProgram(t, scratch, id, start, real_end, flags) + == HWLM_TERMINATE_MATCHING) { + assert(can_stop_matching(scratch)); + DEBUG_PRINTF("caller requested termination\n"); + return MO_HALT_MATCHING; + } + + DEBUG_PRINTF("DONE groups=0x%016llx\n", tctxt->groups); + + return MO_CONTINUE_MATCHING; +} + +/** + * \brief Run the program for the given literal ID, with the interpreter + * inlined into this call. + * + * Assumes not in_anchored. + */ +static really_inline +hwlmcb_rv_t roseProcessMatchInline(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end, + u32 id) { + DEBUG_PRINTF("id=%u\n", id); + const u64a som = 0; + const u8 flags = 0; + if (t->pureLiteral) { + return roseRunProgram_l(t, scratch, id, som, end, flags); + } else { + return roseRunProgram(t, scratch, id, som, end, flags); + } +} + +static rose_inline +hwlmcb_rv_t playDelaySlot(const struct RoseEngine *t, + struct hs_scratch *scratch, + struct fatbit **delaySlots, u32 vicIndex, + u64a offset) { + /* assert(!tctxt->in_anchored); */ + assert(vicIndex < DELAY_SLOT_COUNT); + const struct fatbit *vicSlot = delaySlots[vicIndex]; + u32 delay_count = t->delay_count; + + if (offset < t->floatingMinLiteralMatchOffset) { + DEBUG_PRINTF("too soon\n"); + return HWLM_CONTINUE_MATCHING; + } + + struct RoseContext *tctxt = &scratch->tctxt; + roseFlushLastByteHistory(t, scratch, offset); + tctxt->lastEndOffset = offset; + + const u32 *programs = getByOffset(t, t->delayProgramOffset); + + for (u32 it = fatbit_iterate(vicSlot, delay_count, MMB_INVALID); + it != MMB_INVALID; it = fatbit_iterate(vicSlot, delay_count, it)) { + UNUSED rose_group old_groups = tctxt->groups; + + DEBUG_PRINTF("DELAYED MATCH id=%u offset=%llu\n", it, offset); + const u64a som = 0; + const u8 flags = 0; + hwlmcb_rv_t rv = roseRunProgram(t, scratch, programs[it], som, offset, + flags); + DEBUG_PRINTF("DONE groups=0x%016llx\n", tctxt->groups); + + /* delayed literals can't safely set groups. 
+ * However we may be setting groups that successors already have + * worked out that we don't need to match the group */ + DEBUG_PRINTF("groups in %016llx out %016llx\n", old_groups, + tctxt->groups); + + if (rv == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +hwlmcb_rv_t flushAnchoredLiteralAtLoc(const struct RoseEngine *t, + struct hs_scratch *scratch, + u32 curr_loc) { + struct RoseContext *tctxt = &scratch->tctxt; + struct fatbit *curr_row = getAnchoredLiteralLog(scratch)[curr_loc - 1]; + u32 region_width = t->anchored_count; + + const u32 *programs = getByOffset(t, t->anchoredProgramOffset); + + DEBUG_PRINTF("report matches at curr loc\n"); + for (u32 it = fatbit_iterate(curr_row, region_width, MMB_INVALID); + it != MMB_INVALID; it = fatbit_iterate(curr_row, region_width, it)) { + DEBUG_PRINTF("it = %u/%u\n", it, region_width); + + rose_group old_groups = tctxt->groups; + DEBUG_PRINTF("ANCH REPLAY MATCH id=%u offset=%u\n", it, curr_loc); + const u64a som = 0; + const u8 flags = 0; + hwlmcb_rv_t rv = roseRunProgram(t, scratch, programs[it], som, curr_loc, + flags); + DEBUG_PRINTF("DONE groups=0x%016llx\n", tctxt->groups); + + /* anchored literals can't safely set groups. + * However we may be setting groups that successors already + * have worked out that we don't need to match the group */ + DEBUG_PRINTF("groups in %016llx out %016llx\n", old_groups, + tctxt->groups); + tctxt->groups &= old_groups; + + if (rv == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + + /* clear row; does not invalidate iteration */ + bf64_unset(&scratch->al_log_sum, curr_loc - 1); + + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +u32 anchored_it_begin(struct hs_scratch *scratch) { + struct RoseContext *tctxt = &scratch->tctxt; + if (tctxt->lastEndOffset >= scratch->anchored_literal_region_len) { + return MMB_INVALID; + } + u32 begin = tctxt->lastEndOffset; + begin--; + + return bf64_iterate(scratch->al_log_sum, begin); +} + +static really_inline +hwlmcb_rv_t flushAnchoredLiterals(const struct RoseEngine *t, + struct hs_scratch *scratch, + u32 *anchored_it_param, u64a to_off) { + struct RoseContext *tctxt = &scratch->tctxt; + u32 anchored_it = *anchored_it_param; + /* catch up any remaining anchored matches */ + for (; anchored_it != MMB_INVALID && anchored_it < to_off; + anchored_it = bf64_iterate(scratch->al_log_sum, anchored_it)) { + assert(anchored_it < scratch->anchored_literal_region_len); + DEBUG_PRINTF("loc_it = %u\n", anchored_it); + u32 curr_off = anchored_it + 1; + roseFlushLastByteHistory(t, scratch, curr_off); + tctxt->lastEndOffset = curr_off; + + if (flushAnchoredLiteralAtLoc(t, scratch, curr_off) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + + *anchored_it_param = anchored_it; + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +hwlmcb_rv_t playVictims(const struct RoseEngine *t, struct hs_scratch *scratch, + u32 *anchored_it, u64a lastEnd, u64a victimDelaySlots, + struct fatbit **delaySlots) { + while (victimDelaySlots) { + u32 vic = findAndClearLSB_64(&victimDelaySlots); + DEBUG_PRINTF("vic = %u\n", vic); + u64a vicOffset = vic + (lastEnd & ~(u64a)DELAY_MASK); + + if (flushAnchoredLiterals(t, scratch, anchored_it, vicOffset) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + + if (playDelaySlot(t, scratch, delaySlots, vic % DELAY_SLOT_COUNT, + vicOffset) == HWLM_TERMINATE_MATCHING) { + return 
HWLM_TERMINATE_MATCHING; + } + } + + return HWLM_CONTINUE_MATCHING; +} + +/* call flushQueuedLiterals instead */ +hwlmcb_rv_t flushQueuedLiterals_i(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a currEnd) { + struct RoseContext *tctxt = &scratch->tctxt; + u64a lastEnd = tctxt->delayLastEndOffset; + DEBUG_PRINTF("flushing backed up matches @%llu up from %llu\n", currEnd, + lastEnd); + + assert(currEnd != lastEnd); /* checked in main entry point */ + + u32 anchored_it = anchored_it_begin(scratch); + + if (!tctxt->filledDelayedSlots) { + DEBUG_PRINTF("no delayed, no flush\n"); + goto anchored_leftovers; + } + + { + struct fatbit **delaySlots = getDelaySlots(scratch); + + u32 lastIndex = lastEnd & DELAY_MASK; + u32 currIndex = currEnd & DELAY_MASK; + + int wrapped = (lastEnd | DELAY_MASK) < currEnd; + + u64a victimDelaySlots; /* needs to be twice as wide as the number of + * slots. */ + + DEBUG_PRINTF("hello %08x\n", tctxt->filledDelayedSlots); + if (!wrapped) { + victimDelaySlots = tctxt->filledDelayedSlots; + + DEBUG_PRINTF("unwrapped %016llx %08x\n", victimDelaySlots, + tctxt->filledDelayedSlots); + /* index vars < 32 so 64bit shifts are safe */ + + /* clear all slots at last index and below, */ + victimDelaySlots &= ~((1LLU << (lastIndex + 1)) - 1); + + /* clear all slots above curr index */ + victimDelaySlots &= (1LLU << (currIndex + 1)) - 1; + + tctxt->filledDelayedSlots &= ~victimDelaySlots; + + DEBUG_PRINTF("unwrapped %016llx %08x\n", victimDelaySlots, + tctxt->filledDelayedSlots); + } else { + DEBUG_PRINTF("wrapped %08x\n", tctxt->filledDelayedSlots); + + /* 1st half: clear all slots at last index and below, */ + u64a first_half = tctxt->filledDelayedSlots; + first_half &= ~((1ULL << (lastIndex + 1)) - 1); + tctxt->filledDelayedSlots &= (1ULL << (lastIndex + 1)) - 1; + + u64a second_half = tctxt->filledDelayedSlots; + + if (currEnd > lastEnd + DELAY_SLOT_COUNT) { + /* 2nd half: clear all slots above last index */ + second_half &= (1ULL << (lastIndex + 1)) - 1; + } else { + /* 2nd half: clear all slots above curr index */ + second_half &= (1ULL << (currIndex + 1)) - 1; + } + tctxt->filledDelayedSlots &= ~second_half; + + victimDelaySlots = first_half | (second_half << DELAY_SLOT_COUNT); + + DEBUG_PRINTF("-- %016llx %016llx = %016llx (li %u)\n", first_half, + second_half, victimDelaySlots, lastIndex); + } + + if (playVictims(t, scratch, &anchored_it, lastEnd, victimDelaySlots, + delaySlots) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + +anchored_leftovers:; + hwlmcb_rv_t rv = flushAnchoredLiterals(t, scratch, &anchored_it, currEnd); + tctxt->delayLastEndOffset = currEnd; + return rv; +} + +static really_inline +hwlmcb_rv_t roseCallback_i(size_t end, u32 id, struct hs_scratch *scratch) { + struct RoseContext *tctx = &scratch->tctxt; + const struct RoseEngine *t = scratch->core_info.rose; + + u64a real_end = end + tctx->lit_offset_adjust; + +#if defined(DEBUG) + DEBUG_PRINTF("MATCH id=%u end offset@%llu: ", id, real_end); + u64a start = real_end < 8 ? 
1 : real_end - 7; + printMatch(&scratch->core_info, start, real_end); + printf("\n"); +#endif + DEBUG_PRINTF("last end %llu\n", tctx->lastEndOffset); + + DEBUG_PRINTF("STATE groups=0x%016llx\n", tctx->groups); + + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("received a match when we're already dead!\n"); + return HWLM_TERMINATE_MATCHING; + } + + hwlmcb_rv_t rv = flushQueuedLiterals(t, scratch, real_end); + /* flushDelayed may have advanced tctx->lastEndOffset */ + + if (real_end >= t->floatingMinLiteralMatchOffset) { + roseFlushLastByteHistory(t, scratch, real_end); + tctx->lastEndOffset = real_end; + } + + if (rv == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + + rv = roseProcessMatchInline(t, scratch, real_end, id); + + DEBUG_PRINTF("DONE groups=0x%016llx\n", tctx->groups); + + if (rv != HWLM_TERMINATE_MATCHING) { + return tctx->groups; + } + + assert(can_stop_matching(scratch)); + DEBUG_PRINTF("user requested halt\n"); + return HWLM_TERMINATE_MATCHING; +} + +hwlmcb_rv_t roseCallback(size_t end, u32 id, struct hs_scratch *scratch) { + return roseCallback_i(end, id, scratch); +} + +hwlmcb_rv_t roseFloatingCallback(size_t end, u32 id, + struct hs_scratch *scratch) { + const struct RoseEngine *t = scratch->core_info.rose; + + return roseCallback_i(end, id, scratch) & t->floating_group_mask; +} + +/** + * \brief Execute a boundary report program. + * + * Returns MO_HALT_MATCHING if the stream is exhausted or the user has + * instructed us to halt, or MO_CONTINUE_MATCHING otherwise. + */ +int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, + u64a stream_offset, struct hs_scratch *scratch) { + DEBUG_PRINTF("running boundary program at offset %u\n", program); + + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("can stop matching\n"); + return MO_HALT_MATCHING; + } + + if (rose->hasSom && scratch->deduper.current_report_offset == ~0ULL) { + /* we cannot delay the initialization of the som deduper logs any longer + * as we are reporting matches. This is done explicitly as we are + * shortcutting the som handling in the vacuous repeats as we know they + * all come from non-som patterns. */ + fatbit_clear(scratch->deduper.som_log[0]); + fatbit_clear(scratch->deduper.som_log[1]); + scratch->deduper.som_log_dirty = 0; + } + + // Keep assertions in program report path happy. At offset zero, there can + // have been no earlier reports. At EOD, all earlier reports should have + // been handled and we will have been caught up to the stream offset by the + // time we are running boundary report programs. + scratch->tctxt.minMatchOffset = stream_offset; + + const u64a som = 0; + const u8 flags = 0; + hwlmcb_rv_t rv = roseRunProgram(rose, scratch, program, som, stream_offset, + flags); + if (rv == HWLM_TERMINATE_MATCHING) { + return MO_HALT_MATCHING; + } + + return MO_CONTINUE_MATCHING; +} + +/** + * \brief Execute a flush combination program. + * + * Returns MO_HALT_MATCHING if the stream is exhausted or the user has + * instructed us to halt, or MO_CONTINUE_MATCHING otherwise. + */ +int roseRunFlushCombProgram(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a end) { + hwlmcb_rv_t rv = roseRunProgram(rose, scratch, rose->flushCombProgramOffset, + 0, end, 0); + if (rv == HWLM_TERMINATE_MATCHING) { + return MO_HALT_MATCHING; + } + return MO_CONTINUE_MATCHING; +} + +/** + * \brief Execute last flush combination program. 
+ * + * Returns MO_HALT_MATCHING if the stream is exhausted or the user has + * instructed us to halt, or MO_CONTINUE_MATCHING otherwise. + */ +int roseRunLastFlushCombProgram(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a end) { + hwlmcb_rv_t rv = roseRunProgram(rose, scratch, + rose->lastFlushCombProgramOffset, + 0, end, 0); + if (rv == HWLM_TERMINATE_MATCHING) { + return MO_HALT_MATCHING; + } + return MO_CONTINUE_MATCHING; +} + +int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context) { + struct hs_scratch *scratch = context; + assert(scratch && scratch->magic == SCRATCH_MAGIC); + + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); + + const struct RoseEngine *rose = scratch->core_info.rose; + + // Our match ID is the program offset. + const u32 program = id; + const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; + hwlmcb_rv_t rv; + if (rose->pureLiteral) { + rv = roseRunProgram_l(rose, scratch, program, start, end, flags); + } else { + rv = roseRunProgram(rose, scratch, program, start, end, flags); + } + if (rv == HWLM_TERMINATE_MATCHING) { + return MO_HALT_MATCHING; + } + + return can_stop_matching(scratch) ? MO_HALT_MATCHING : MO_CONTINUE_MATCHING; +} diff --git a/regex/rose/match.h b/regex/rose/match.h new file mode 100644 index 000000000..c03b1ebba --- /dev/null +++ b/regex/rose/match.h @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef ROSE_MATCH_H +#define ROSE_MATCH_H + +#include "catchup.h" +#include "runtime.h" +#include "scratch.h" +#include "report.h" +#include "rose_common.h" +#include "rose_internal.h" +#include "ue2common.h" +#include "hwlm/hwlm.h" +#include "nfa/nfa_api.h" +#include "nfa/nfa_api_queue.h" +#include "nfa/nfa_api_util.h" +#include "som/som_runtime.h" +#include "util/bitutils.h" +#include "util/exhaust.h" +#include "util/fatbit.h" +#include "util/multibit.h" + +/* Callbacks, defined in catchup.c */ + +int roseNfaAdaptor(u64a start, u64a end, ReportID id, void *context); + +/* Callbacks, defined in match.c */ + +hwlmcb_rv_t roseCallback(size_t end, u32 id, struct hs_scratch *scratch); +hwlmcb_rv_t roseFloatingCallback(size_t end, u32 id, + struct hs_scratch *scratch); +hwlmcb_rv_t roseDelayRebuildCallback(size_t end, u32 id, + struct hs_scratch *scratch); +int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx); + +/* Common code, used all over Rose runtime */ + +hwlmcb_rv_t roseHandleChainMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 event, + u64a top_squash_distance, u64a end, + char in_catchup); + +/** \brief Initialize the queue for a suffix/outfix engine. */ +static really_inline +void initQueue(struct mq *q, u32 qi, const struct RoseEngine *t, + struct hs_scratch *scratch) { + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + assert(scratch->fullState); + q->nfa = getNfaByInfo(t, info); + q->end = 0; + q->cur = 0; + q->state = scratch->fullState + info->fullStateOffset; + q->streamState = scratch->core_info.state + info->stateOffset; + q->offset = scratch->core_info.buf_offset; + q->buffer = scratch->core_info.buf; + q->length = scratch->core_info.len; + q->history = scratch->core_info.hbuf; + q->hlength = scratch->core_info.hlen; + q->cb = roseNfaAdaptor; + q->context = scratch; + q->report_current = 0; + + DEBUG_PRINTF("qi=%u, offset=%llu, fullState=%u, streamState=%u, " + "state=%u\n", qi, q->offset, info->fullStateOffset, + info->stateOffset, *(u32 *)q->state); +} + +/** \brief Initialize the queue for a leftfix (prefix/infix) engine. */ +static really_inline +void initRoseQueue(const struct RoseEngine *t, u32 qi, + const struct LeftNfaInfo *left, + struct hs_scratch *scratch) { + struct mq *q = scratch->queues + qi; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + q->nfa = getNfaByInfo(t, info); + q->end = 0; + q->cur = 0; + q->state = scratch->fullState + info->fullStateOffset; + + // Transient roses don't have stream state, we use tstate in scratch + // instead. The only reason we need this at ALL is for LimEx extended + // regions, which assume that they have access to q->streamState + + // compressedStateSize. 
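+    // Transient leftfixes are started fresh on every stream write, so the
+    // per-scan tstate buffer in scratch is enough to back their stream state.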
+ if (left->transient) { + q->streamState = (char *)scratch->tstate + info->stateOffset; + } else { + q->streamState = scratch->core_info.state + info->stateOffset; + } + + q->offset = scratch->core_info.buf_offset; + q->buffer = scratch->core_info.buf; + q->length = scratch->core_info.len; + q->history = scratch->core_info.hbuf; + q->hlength = scratch->core_info.hlen; + q->cb = NULL; + q->context = NULL; + q->report_current = 0; + + DEBUG_PRINTF("qi=%u, offset=%llu, fullState=%u, streamState=%u, " + "state=%u\n", qi, q->offset, info->fullStateOffset, + info->stateOffset, *(u32 *)q->state); +} + +/** returns 0 if space for two items (top and end) on the queue */ +static really_inline +char isQueueFull(const struct mq *q) { + return q->end + 2 > MAX_MQE_LEN; +} + +static really_inline +void loadStreamState(const struct NFA *nfa, struct mq *q, s64a loc) { + DEBUG_PRINTF("offset=%llu, length=%zu, hlength=%zu, loc=%lld\n", + q->offset, q->length, q->hlength, loc); + nfaExpandState(nfa, q->state, q->streamState, q->offset + loc, + queue_prev_byte(q, loc)); +} + +static really_inline +void storeRoseDelay(const struct RoseEngine *t, char *state, + const struct LeftNfaInfo *left, u32 loc) { + u32 di = left->lagIndex; + if (di == ROSE_OFFSET_INVALID) { + return; + } + + assert(loc < 256); // ONE WHOLE BYTE! + DEBUG_PRINTF("storing rose delay %u in slot %u\n", loc, di); + u8 *leftfixDelay = getLeftfixLagTable(t, state); + assert(loc <= MAX_STORED_LEFTFIX_LAG); + leftfixDelay[di] = loc; +} + +static really_inline +void setAsZombie(const struct RoseEngine *t, char *state, + const struct LeftNfaInfo *left) { + u32 di = left->lagIndex; + assert(di != ROSE_OFFSET_INVALID); + if (di == ROSE_OFFSET_INVALID) { + return; + } + + u8 *leftfixDelay = getLeftfixLagTable(t, state); + leftfixDelay[di] = OWB_ZOMBIE_ALWAYS_YES; +} + +/* loadRoseDelay MUST NOT be called on the first stream write as it is only + * initialized for running nfas on stream boundaries */ +static really_inline +u32 loadRoseDelay(const struct RoseEngine *t, const char *state, + const struct LeftNfaInfo *left) { + u32 di = left->lagIndex; + if (di == ROSE_OFFSET_INVALID) { + return 0; + } + + const u8 *leftfixDelay = getLeftfixLagTableConst(t, state); + u32 loc = leftfixDelay[di]; + DEBUG_PRINTF("read rose delay %u from slot %u\n", loc, di); + return loc; +} + +static really_inline +char isZombie(const struct RoseEngine *t, const char *state, + const struct LeftNfaInfo *left) { + u32 di = left->lagIndex; + assert(di != ROSE_OFFSET_INVALID); + if (di == ROSE_OFFSET_INVALID) { + return 0; + } + + const u8 *leftfixDelay = getLeftfixLagTableConst(t, state); + DEBUG_PRINTF("read owb %hhu from slot %u\n", leftfixDelay[di], di); + return leftfixDelay[di] == OWB_ZOMBIE_ALWAYS_YES; +} + +hwlmcb_rv_t flushQueuedLiterals_i(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end); + +static really_inline +hwlmcb_rv_t flushQueuedLiterals(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end) { + struct RoseContext *tctxt = &scratch->tctxt; + + if (tctxt->delayLastEndOffset == end) { + DEBUG_PRINTF("no progress, no flush\n"); + return HWLM_CONTINUE_MATCHING; + } + + if (!tctxt->filledDelayedSlots && !scratch->al_log_sum) { + tctxt->delayLastEndOffset = end; + return HWLM_CONTINUE_MATCHING; + } + + return flushQueuedLiterals_i(t, scratch, end); +} + +static really_inline +hwlmcb_rv_t cleanUpDelayed(const struct RoseEngine *t, + struct hs_scratch *scratch, size_t length, + u64a offset) { + if (can_stop_matching(scratch)) { + return 
HWLM_TERMINATE_MATCHING; + } + + if (flushQueuedLiterals(t, scratch, length + offset) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + + struct RoseContext *tctxt = &scratch->tctxt; + if (tctxt->filledDelayedSlots) { + DEBUG_PRINTF("dirty\n"); + scratch->core_info.status |= STATUS_DELAY_DIRTY; + } else { + scratch->core_info.status &= ~STATUS_DELAY_DIRTY; + } + + tctxt->filledDelayedSlots = 0; + tctxt->delayLastEndOffset = offset; + + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +void roseFlushLastByteHistory(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a currEnd) { + if (!t->lastByteHistoryIterOffset) { + return; + } + + struct RoseContext *tctxt = &scratch->tctxt; + struct core_info *ci = &scratch->core_info; + + /* currEnd is last byte of string + 1 */ + if (tctxt->lastEndOffset == ci->buf_offset + ci->len + || currEnd != ci->buf_offset + ci->len) { + /* already flushed or it is not yet time to flush */ + return; + } + + DEBUG_PRINTF("flushing\n"); + + const struct mmbit_sparse_iter *it = + getByOffset(t, t->lastByteHistoryIterOffset); + assert(ISALIGNED(it)); + + const u32 numStates = t->rolesWithStateCount; + void *role_state = getRoleState(scratch->core_info.state); + + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + mmbit_sparse_iter_unset(role_state, numStates, it, si_state); +} + +static rose_inline +int roseHasInFlightMatches(const struct RoseEngine *t, char *state, + const struct hs_scratch *scratch) { + if (scratch->al_log_sum) { + DEBUG_PRINTF("anchored literals in log\n"); + return 1; + } + + if (scratch->tctxt.filledDelayedSlots) { + DEBUG_PRINTF("delayed literal\n"); + return 1; + } + + if (mmbit_any(getRoleState(state), t->rolesWithStateCount)) { + DEBUG_PRINTF("role state is set\n"); + return 1; + } + + return 0; +} + +static rose_inline +hwlmcb_rv_t roseHaltIfExhausted(const struct RoseEngine *t, + struct hs_scratch *scratch) { + struct core_info *ci = &scratch->core_info; + if (isAllExhausted(t, ci->exhaustionVector)) { + ci->status |= STATUS_EXHAUSTED; + scratch->tctxt.groups = 0; + DEBUG_PRINTF("all exhausted, termination requested\n"); + return HWLM_TERMINATE_MATCHING; + } + + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +hwlmcb_rv_t ensureQueueFlushed_i(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 qi, s64a loc, + char is_mpv, char in_catchup) { + struct RoseContext *tctxt = &scratch->tctxt; + u8 *aa = getActiveLeafArray(t, scratch->core_info.state); + struct fatbit *activeQueues = scratch->aqa; + u32 aaCount = t->activeArrayCount; + u32 qCount = t->queueCount; + + struct mq *q = &scratch->queues[qi]; + DEBUG_PRINTF("qcl %lld, loc: %lld, min (non mpv) match offset: %llu\n", + q_cur_loc(q), loc, tctxt->minNonMpvMatchOffset); + if (q_cur_loc(q) == loc) { + /* too many tops enqueued at the one spot; need to flatten this queue. + * We can use the full catchups as it will short circuit as we are + * already at this location. 
It also saves waking everybody up */ + pushQueueNoMerge(q, MQE_END, loc); + nfaQueueExec(q->nfa, q, loc); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else if (!in_catchup) { + if (is_mpv) { + tctxt->next_mpv_offset = 0; /* force us to catch the mpv */ + if (loc + scratch->core_info.buf_offset + <= tctxt->minNonMpvMatchOffset) { + DEBUG_PRINTF("flushing chained\n"); + if (roseCatchUpMPV(t, loc, scratch) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + goto done_queue_empty; + } + } + + if (roseCatchUpTo(t, scratch, loc + scratch->core_info.buf_offset) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } else { + /* we must be a chained nfa */ + assert(is_mpv); + DEBUG_PRINTF("flushing chained\n"); + tctxt->next_mpv_offset = 0; /* force us to catch the mpv */ + if (roseCatchUpMPV(t, loc, scratch) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } +done_queue_empty: + if (!mmbit_set(aa, aaCount, qi)) { + initQueue(q, qi, t, scratch); + nfaQueueInitState(q->nfa, q); + pushQueueAt(q, 0, MQE_START, loc); + fatbit_set(activeQueues, qCount, qi); + } + + assert(!isQueueFull(q)); + + return roseHaltIfExhausted(t, scratch); +} + +static rose_inline +hwlmcb_rv_t ensureQueueFlushed(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 qi, s64a loc) { + return ensureQueueFlushed_i(t, scratch, qi, loc, 0, 0); +} + +#endif diff --git a/regex/rose/miracle.h b/regex/rose/miracle.h new file mode 100644 index 000000000..604c50205 --- /dev/null +++ b/regex/rose/miracle.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ROSE_MIRACLE_H +#define ROSE_MIRACLE_H + +#include "ue2common.h" +#include "runtime.h" +#include "rose_internal.h" + +/** \brief Maximum number of bytes to scan when looking for a "miracle" stop + * character. 
*/ +#define MIRACLE_LEN_MAX 32 + +static really_inline +u64a roseMiracleScan(const u8 *stop, const u8 *d, const u8 *d_start) { + assert(d >= d_start); + + // Note: unrolling this loop manually does appear to reduce its + // performance. I'm sick of tilting at this particular windmill. + + u32 mshift = 0; + do { + u64a s = (u64a)stop[*d]; + if (s) { + s <<= mshift; + return s; + } + mshift++; + } while (--d >= d_start); + return 0; +} + +/** + * \brief "Miracle" scan: uses stop table to check if we can skip forward to a + * location where we know that the given rose engine will be in a known state. + * + * Scans the buffer/history between relative locations \a begin_loc and \a + * end_loc, and returns a miracle location (if any) that appears in the stream + * after \a begin_loc. + * + * Returns 1 if some bytes can be skipped and sets \a miracle_loc + * appropriately, 0 otherwise. + */ +static rose_inline +char roseMiracleOccurs(const struct RoseEngine *t, + const struct LeftNfaInfo *left, + const struct core_info *ci, const s64a begin_loc, + const s64a end_loc, s64a *miracle_loc) { + assert(!left->transient); + assert(left->stopTable); + + DEBUG_PRINTF("looking for miracle over [%lld,%lld], maxLag=%u\n", + begin_loc, end_loc, left->maxLag); + DEBUG_PRINTF("ci->len=%zu, ci->hlen=%zu\n", ci->len, ci->hlen); + + assert(begin_loc <= end_loc); + assert(begin_loc >= -(s64a)ci->hlen); + assert(end_loc <= (s64a)ci->len); + + const u8 *stop = getByOffset(t, left->stopTable); + + const s64a scan_end_loc = end_loc - left->maxLag; + if (scan_end_loc <= begin_loc) { + DEBUG_PRINTF("nothing to scan\n"); + return 0; + } + + const s64a start = MAX(begin_loc, scan_end_loc - MIRACLE_LEN_MAX); + DEBUG_PRINTF("scan [%lld..%lld]\n", start, scan_end_loc); + + u64a s = 0; // state, on bits are miracle locations + + // Scan buffer. + const s64a buf_scan_start = MAX(0, start); + if (scan_end_loc > buf_scan_start) { + const u8 *buf = ci->buf; + const u8 *d = buf + scan_end_loc - 1; + const u8 *d_start = buf + buf_scan_start; + s = roseMiracleScan(stop, d, d_start); + if (s) { + goto miracle_found; + } + } + + // Scan history. + if (start < 0) { + const u8 *hbuf_end = ci->hbuf + ci->hlen; + const u8 *d = hbuf_end + MIN(0, scan_end_loc) - 1; + const u8 *d_start = hbuf_end + start; + s = roseMiracleScan(stop, d, d_start); + if (scan_end_loc > 0) { + // Shift s over to account for the buffer scan above. + s <<= scan_end_loc; + } + } + + if (s) { + miracle_found: + DEBUG_PRINTF("s=0x%llx, ctz=%u\n", s, ctz64(s)); + s64a loc = end_loc - left->maxLag - ctz64(s) - 1; + if (loc > begin_loc) { + DEBUG_PRINTF("miracle at %lld\n", loc); + *miracle_loc = loc; + return 1; + } + } + + DEBUG_PRINTF("no viable miraculous stop characters found\n"); + return 0; +} + +#endif // ROSE_MIRACLE_H diff --git a/regex/rose/program_runtime.c b/regex/rose/program_runtime.c new file mode 100644 index 000000000..e6d1d5ae3 --- /dev/null +++ b/regex/rose/program_runtime.c @@ -0,0 +1,3509 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Rose runtime: program interpreter. + */ + +#include "program_runtime.h" + +#include "catchup.h" +#include "counting_miracle.h" +#include "infix.h" +#include "match.h" +#include "miracle.h" +#include "report.h" +#include "rose_common.h" +#include "rose_internal.h" +#include "rose_program.h" +#include "rose_types.h" +#include "validate_mask.h" +#include "validate_shufti.h" +#include "runtime.h" +#include "util/compare.h" +#include "util/copybytes.h" +#include "util/fatbit.h" +#include "util/multibit.h" + +/* Inline implementation follows. */ + +static rose_inline +void rosePushDelayedMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 delay, + u32 delay_index, u64a offset) { + assert(delay); + + const u32 src_slot_index = delay; + u32 slot_index = (src_slot_index + offset) & DELAY_MASK; + + struct RoseContext *tctxt = &scratch->tctxt; + if (offset + src_slot_index <= tctxt->delayLastEndOffset) { + DEBUG_PRINTF("skip too late\n"); + return; + } + + const u32 delay_count = t->delay_count; + struct fatbit **delaySlots = getDelaySlots(scratch); + struct fatbit *slot = delaySlots[slot_index]; + + DEBUG_PRINTF("pushing tab %u into slot %u\n", delay_index, slot_index); + if (!(tctxt->filledDelayedSlots & (1U << slot_index))) { + tctxt->filledDelayedSlots |= 1U << slot_index; + fatbit_clear(slot); + } + + fatbit_set(slot, delay_count, delay_index); +} + +static rose_inline +void recordAnchoredLiteralMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 anch_id, + u64a end) { + assert(end); + + if (end <= t->floatingMinLiteralMatchOffset) { + return; + } + + struct fatbit **anchoredLiteralRows = getAnchoredLiteralLog(scratch); + + DEBUG_PRINTF("record %u (of %u) @ %llu\n", anch_id, t->anchored_count, end); + + if (!bf64_set(&scratch->al_log_sum, end - 1)) { + // first time, clear row + DEBUG_PRINTF("clearing %llu/%u\n", end - 1, t->anchored_count); + fatbit_clear(anchoredLiteralRows[end - 1]); + } + + assert(anch_id < t->anchored_count); + fatbit_set(anchoredLiteralRows[end - 1], t->anchored_count, anch_id); +} + +static rose_inline +char roseLeftfixCheckMiracles(const struct RoseEngine *t, + const struct LeftNfaInfo *left, + struct core_info *ci, struct mq *q, u64a end, + const char is_infix) { + if (!is_infix && left->transient) { + // Miracles won't help us with transient leftfix engines; they only + // scan for a limited time anyway. 
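+        // Report the engine as still alive; callers only treat a zero return
+        // as leftfix death.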
+ return 1; + } + + if (!left->stopTable) { + return 1; + } + + DEBUG_PRINTF("looking for miracle on queue %u\n", q->nfa->queueIndex); + + const s64a begin_loc = q_cur_loc(q); + const s64a end_loc = end - ci->buf_offset; + + s64a miracle_loc; + if (roseMiracleOccurs(t, left, ci, begin_loc, end_loc, &miracle_loc)) { + goto found_miracle; + } + + if (roseCountingMiracleOccurs(t, left, ci, begin_loc, end_loc, + &miracle_loc)) { + goto found_miracle; + } + + return 1; + +found_miracle: + DEBUG_PRINTF("miracle at %lld\n", miracle_loc); + assert(miracle_loc >= begin_loc); + + // If we're a prefix, then a miracle effectively results in us needing to + // re-init our state and start fresh. + if (!is_infix) { + if (miracle_loc != begin_loc) { + DEBUG_PRINTF("re-init prefix state\n"); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, miracle_loc); + pushQueueAt(q, 1, MQE_TOP, miracle_loc); + nfaQueueInitState(q->nfa, q); + } + return 1; + } + + // Otherwise, we're an infix. Remove tops before the miracle from the queue + // and re-init at that location. + + q_skip_forward_to(q, miracle_loc); + + if (q_last_type(q) == MQE_START) { + DEBUG_PRINTF("miracle caused infix to die\n"); + return 0; + } + + DEBUG_PRINTF("re-init infix state\n"); + assert(q->items[q->cur].type == MQE_START); + q->items[q->cur].location = miracle_loc; + nfaQueueInitState(q->nfa, q); + + return 1; +} + +static rose_inline +hwlmcb_rv_t roseTriggerSuffix(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 qi, u32 top, + u64a som, u64a end) { + DEBUG_PRINTF("suffix qi=%u, top event=%u\n", qi, top); + + struct core_info *ci = &scratch->core_info; + u8 *aa = getActiveLeafArray(t, ci->state); + const u32 aaCount = t->activeArrayCount; + const u32 qCount = t->queueCount; + struct mq *q = &scratch->queues[qi]; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + const struct NFA *nfa = getNfaByInfo(t, info); + + s64a loc = (s64a)end - ci->buf_offset; + assert(loc <= (s64a)ci->len && loc >= -(s64a)ci->hlen); + + if (!mmbit_set(aa, aaCount, qi)) { + initQueue(q, qi, t, scratch); + nfaQueueInitState(nfa, q); + pushQueueAt(q, 0, MQE_START, loc); + fatbit_set(scratch->aqa, qCount, qi); + } else if (info->no_retrigger) { + DEBUG_PRINTF("yawn\n"); + /* nfa only needs one top; we can go home now */ + return HWLM_CONTINUE_MATCHING; + } else if (!fatbit_set(scratch->aqa, qCount, qi)) { + initQueue(q, qi, t, scratch); + loadStreamState(nfa, q, 0); + pushQueueAt(q, 0, MQE_START, 0); + } else if (isQueueFull(q)) { + DEBUG_PRINTF("queue %u full -> catching up nfas\n", qi); + if (info->eod) { + /* can catch up suffix independently no pq */ + q->context = NULL; + pushQueueNoMerge(q, MQE_END, loc); + nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else if (ensureQueueFlushed(t, scratch, qi, loc) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + + assert(top == MQE_TOP || (top >= MQE_TOP_FIRST && top < MQE_INVALID)); + pushQueueSom(q, top, loc, som); + + if (q_cur_loc(q) == (s64a)ci->len && !info->eod) { + /* we may not run the nfa; need to ensure state is fine */ + DEBUG_PRINTF("empty run\n"); + pushQueueNoMerge(q, MQE_END, loc); + char alive = nfaQueueExec(nfa, q, loc); + if (alive) { + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + mmbit_unset(aa, aaCount, qi); + fatbit_unset(scratch->aqa, qCount, qi); + } + } + + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +char roseTestLeftfix(const struct RoseEngine *t, 
struct hs_scratch *scratch, + u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end, + const char is_infix) { + struct core_info *ci = &scratch->core_info; + + u32 ri = queueToLeftIndex(t, qi); + const struct LeftNfaInfo *left = getLeftTable(t) + ri; + + DEBUG_PRINTF("testing %s %s %u/%u with lag %u (maxLag=%u)\n", + (left->transient ? "transient" : "active"), + (is_infix ? "infix" : "prefix"), + ri, qi, leftfixLag, left->maxLag); + + assert(leftfixLag <= left->maxLag); + assert(left->infix == is_infix); + assert(!is_infix || !left->transient); // Only prefixes can be transient. + + struct mq *q = scratch->queues + qi; + char *state = scratch->core_info.state; + u8 *activeLeftArray = getActiveLeftArray(t, state); + u32 qCount = t->queueCount; + u32 arCount = t->activeLeftCount; + + if (!mmbit_isset(activeLeftArray, arCount, ri)) { + DEBUG_PRINTF("engine is dead nothing to see here\n"); + return 0; + } + + if (unlikely(end < leftfixLag)) { + assert(0); /* lag is the literal length */ + return 0; + } + + if (nfaSupportsZombie(getNfaByQueue(t, qi)) && ci->buf_offset + && !fatbit_isset(scratch->aqa, qCount, qi) + && isZombie(t, state, left)) { + DEBUG_PRINTF("zombie\n"); + return 1; + } + + if (!fatbit_set(scratch->aqa, qCount, qi)) { + DEBUG_PRINTF("initing q %u\n", qi); + initRoseQueue(t, qi, left, scratch); + if (ci->buf_offset) { // there have been writes before us! + s32 sp; + if (!is_infix && left->transient) { + sp = -(s32)ci->hlen; + } else { + sp = -(s32)loadRoseDelay(t, state, left); + } + + /* transient nfas are always started fresh -> state not maintained + * at stream boundary */ + + pushQueueAt(q, 0, MQE_START, sp); + if (is_infix || (ci->buf_offset + sp > 0 && !left->transient)) { + loadStreamState(q->nfa, q, sp); + } else { + pushQueueAt(q, 1, MQE_TOP, sp); + nfaQueueInitState(q->nfa, q); + } + } else { // first write ever + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + nfaQueueInitState(q->nfa, q); + } + } + + s64a loc = (s64a)end - ci->buf_offset - leftfixLag; + assert(loc >= q_cur_loc(q) || left->eager); + assert(leftfixReport != MO_INVALID_IDX); + + if (!is_infix && left->transient) { + s64a start_loc = loc - left->transient; + if (q_cur_loc(q) < start_loc) { + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, start_loc); + pushQueueAt(q, 1, MQE_TOP, start_loc); + nfaQueueInitState(q->nfa, q); + } + } + + if (q_cur_loc(q) < loc || q_last_type(q) != MQE_START) { + if (is_infix) { + if (infixTooOld(q, loc)) { + DEBUG_PRINTF("infix %u died of old age\n", ri); + goto nfa_dead; + } + + reduceInfixQueue(q, loc, left->maxQueueLen, q->nfa->maxWidth); + } + + if (!roseLeftfixCheckMiracles(t, left, ci, q, end, is_infix)) { + DEBUG_PRINTF("leftfix %u died due to miracle\n", ri); + goto nfa_dead; + } + +#ifdef DEBUG + debugQueue(q); +#endif + + pushQueueNoMerge(q, MQE_END, loc); + + char rv = nfaQueueExecRose(q->nfa, q, leftfixReport); + if (!rv) { /* nfa is dead */ + DEBUG_PRINTF("leftfix %u died while trying to catch up\n", ri); + goto nfa_dead; + } + + // Queue must have next start loc before we call nfaInAcceptState. + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + + DEBUG_PRINTF("checking for report %u\n", leftfixReport); + DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); + return rv == MO_MATCHES_PENDING; + } else if (q_cur_loc(q) > loc) { + /* an eager leftfix may have already progressed past loc if there is no + * match at loc. 
*/ + assert(left->eager); + return 0; + } else { + assert(q_cur_loc(q) == loc); + DEBUG_PRINTF("checking for report %u\n", leftfixReport); + char rv = nfaInAcceptState(q->nfa, leftfixReport, q); + DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); + return rv; + } + +nfa_dead: + mmbit_unset(activeLeftArray, arCount, ri); + scratch->tctxt.groups &= left->squash_mask; + return 0; +} + +static rose_inline +char roseTestPrefix(const struct RoseEngine *t, struct hs_scratch *scratch, + u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end) { + return roseTestLeftfix(t, scratch, qi, leftfixLag, leftfixReport, end, 0); +} + +static rose_inline +char roseTestInfix(const struct RoseEngine *t, struct hs_scratch *scratch, + u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end) { + return roseTestLeftfix(t, scratch, qi, leftfixLag, leftfixReport, end, 1); +} + +static rose_inline +void roseTriggerInfix(const struct RoseEngine *t, struct hs_scratch *scratch, + u64a start, u64a end, u32 qi, u32 topEvent, u8 cancel) { + struct core_info *ci = &scratch->core_info; + s64a loc = (s64a)end - ci->buf_offset; + + u32 ri = queueToLeftIndex(t, qi); + assert(topEvent < MQE_INVALID); + + const struct LeftNfaInfo *left = getLeftInfoByQueue(t, qi); + assert(!left->transient); + + DEBUG_PRINTF("rose %u (qi=%u) event %u\n", ri, qi, topEvent); + + struct mq *q = scratch->queues + qi; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + + char *state = ci->state; + u8 *activeLeftArray = getActiveLeftArray(t, state); + const u32 arCount = t->activeLeftCount; + char alive = mmbit_set(activeLeftArray, arCount, ri); + + if (alive && info->no_retrigger) { + DEBUG_PRINTF("yawn\n"); + return; + } + + struct fatbit *aqa = scratch->aqa; + const u32 qCount = t->queueCount; + + if (alive && nfaSupportsZombie(getNfaByInfo(t, info)) && ci->buf_offset && + !fatbit_isset(aqa, qCount, qi) && isZombie(t, state, left)) { + DEBUG_PRINTF("yawn - zombie\n"); + return; + } + + if (cancel) { + DEBUG_PRINTF("dominating top: (re)init\n"); + fatbit_set(aqa, qCount, qi); + initRoseQueue(t, qi, left, scratch); + pushQueueAt(q, 0, MQE_START, loc); + nfaQueueInitState(q->nfa, q); + } else if (!fatbit_set(aqa, qCount, qi)) { + DEBUG_PRINTF("initing %u\n", qi); + initRoseQueue(t, qi, left, scratch); + if (alive) { + s32 sp = -(s32)loadRoseDelay(t, state, left); + pushQueueAt(q, 0, MQE_START, sp); + loadStreamState(q->nfa, q, sp); + } else { + pushQueueAt(q, 0, MQE_START, loc); + nfaQueueInitState(q->nfa, q); + } + } else if (!alive) { + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + nfaQueueInitState(q->nfa, q); + } else if (isQueueFull(q)) { + reduceInfixQueue(q, loc, left->maxQueueLen, q->nfa->maxWidth); + + if (isQueueFull(q)) { + /* still full - reduceInfixQueue did nothing */ + DEBUG_PRINTF("queue %u full (%u items) -> catching up nfa\n", qi, + q->end - q->cur); + pushQueueNoMerge(q, MQE_END, loc); + nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX); + + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } + } + + pushQueueSom(q, topEvent, loc, start); +} + +static rose_inline +hwlmcb_rv_t roseReport(const struct RoseEngine *t, struct hs_scratch *scratch, + u64a end, ReportID onmatch, s32 offset_adjust, + u32 ekey) { + DEBUG_PRINTF("firing callback onmatch=%u, end=%llu\n", onmatch, end); + updateLastMatchOffset(&scratch->tctxt, end); + + int cb_rv = roseDeliverReport(end, onmatch, offset_adjust, scratch, ekey); + if (cb_rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("termination requested\n"); + return 
HWLM_TERMINATE_MATCHING; + } + + if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return HWLM_CONTINUE_MATCHING; + } + + return roseHaltIfExhausted(t, scratch); +} + +static rose_inline +hwlmcb_rv_t roseReportComb(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end, + ReportID onmatch, s32 offset_adjust, u32 ekey) { + DEBUG_PRINTF("firing callback onmatch=%u, end=%llu\n", onmatch, end); + + int cb_rv = roseDeliverReport(end, onmatch, offset_adjust, scratch, ekey); + if (cb_rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("termination requested\n"); + return HWLM_TERMINATE_MATCHING; + } + + if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return HWLM_CONTINUE_MATCHING; + } + + return roseHaltIfExhausted(t, scratch); +} + +/* catches up engines enough to ensure any earlier mpv triggers are enqueued + * and then adds the trigger to the mpv queue. */ +static rose_inline +hwlmcb_rv_t roseCatchUpAndHandleChainMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, + u32 event, u64a top_squash_distance, + u64a end, const char in_catchup) { + if (!in_catchup && + roseCatchUpMpvFeeders(t, scratch, end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + return roseHandleChainMatch(t, scratch, event, top_squash_distance, end, + in_catchup); +} + +static rose_inline +void roseHandleSom(struct hs_scratch *scratch, const struct som_operation *sr, + u64a end) { + DEBUG_PRINTF("end=%llu, minMatchOffset=%llu\n", end, + scratch->tctxt.minMatchOffset); + + updateLastMatchOffset(&scratch->tctxt, end); + handleSomInternal(scratch, sr, end); +} + +static rose_inline +hwlmcb_rv_t roseReportSom(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a start, u64a end, + ReportID onmatch, s32 offset_adjust, u32 ekey) { + DEBUG_PRINTF("firing som callback onmatch=%u, start=%llu, end=%llu\n", + onmatch, start, end); + updateLastMatchOffset(&scratch->tctxt, end); + + int cb_rv = roseDeliverSomReport(start, end, onmatch, offset_adjust, + scratch, ekey); + if (cb_rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("termination requested\n"); + return HWLM_TERMINATE_MATCHING; + } + + if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return HWLM_CONTINUE_MATCHING; + } + + return roseHaltIfExhausted(t, scratch); +} + +static rose_inline +void roseHandleSomSom(struct hs_scratch *scratch, + const struct som_operation *sr, u64a start, u64a end) { + DEBUG_PRINTF("start=%llu, end=%llu, minMatchOffset=%llu\n", start, end, + scratch->tctxt.minMatchOffset); + + updateLastMatchOffset(&scratch->tctxt, end); + setSomFromSomAware(scratch, sr, start, end); +} + +static rose_inline +hwlmcb_rv_t roseSetExhaust(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 ekey) { + assert(scratch); + assert(scratch->magic == SCRATCH_MAGIC); + + struct core_info *ci = &scratch->core_info; + + assert(!can_stop_matching(scratch)); + assert(!isExhausted(ci->rose, ci->exhaustionVector, ekey)); + + markAsMatched(ci->rose, ci->exhaustionVector, ekey); + + return roseHaltIfExhausted(t, scratch); +} + +static really_inline +int reachHasBit(const u8 *reach, u8 c) { + return !!(reach[c / 8U] & (u8)1U << (c % 8U)); +} + +/* + * Generate a 8-byte valid_mask with #high bytes 0 from the highest side + * and #low bytes 0 from the lowest side + * and (8 - high - low) bytes '0xff' in the middle. 
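+ * e.g. generateValidMask(2, 1) == 0x0000ffffffffff00: two zero bytes at
+ * the top, one zero byte at the bottom, five 0xff bytes in the middle.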
+ */ +static rose_inline +u64a generateValidMask(const s32 high, const s32 low) { + assert(high + low < 8); + DEBUG_PRINTF("high %d low %d\n", high, low); + const u64a ones = ~0ull; + return (ones << ((high + low) * 8)) >> (high * 8); +} + +/* + * Do the single-byte check if only one lookaround entry exists + * and it's a single mask. + * Return success if the byte is in the future or before history + * (offset is greater than (history) buffer length). + */ +static rose_inline +int roseCheckByte(const struct core_info *ci, u8 and_mask, u8 cmp_mask, + u8 negation, s32 checkOffset, u64a end) { + DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, + ci->buf_offset, ci->buf_offset + ci->len); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + const s64a base_offset = end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("checkOffset=%d offset=%lld\n", checkOffset, offset); + u8 c; + if (offset >= 0) { + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("in the future\n"); + return 1; + } else { + assert(offset < (s64a)ci->len); + DEBUG_PRINTF("check byte in buffer\n"); + c = ci->buf[offset]; + } + } else { + if (offset >= -(s64a) ci->hlen) { + DEBUG_PRINTF("check byte in history\n"); + c = ci->hbuf[ci->hlen + offset]; + } else { + DEBUG_PRINTF("before history and return\n"); + return 1; + } + } + + if (((and_mask & c) != cmp_mask) ^ negation) { + DEBUG_PRINTF("char 0x%02x at offset %lld failed byte check\n", + c, offset); + return 0; + } + + DEBUG_PRINTF("real offset=%lld char=%02x\n", offset, c); + DEBUG_PRINTF("OK :)\n"); + return 1; +} + +static rose_inline +int roseCheckMask(const struct core_info *ci, u64a and_mask, u64a cmp_mask, + u64a neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("rel offset %lld\n",base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u64a data = 0; + u64a valid_data_mask = ~0ULL; // mask for validate check. + //A 0xff byte means that this byte is in the buffer. + s32 shift_l = 0; // size of bytes in the future. + s32 shift_r = 0; // size of bytes before the history. + s32 h_len = 0; // size of bytes in the history buffer. + s32 c_len = 8; // size of bytes in the current buffer. + if (offset < 0) { + // in or before history buffer. + if (offset + 8 <= -(s64a)ci->hlen) { + DEBUG_PRINTF("before history and return\n"); + return 1; + } + const u8 *h_start = ci->hbuf; // start pointer in history buffer. + if (offset < -(s64a)ci->hlen) { + // some bytes are before history. + shift_r = -(offset + (s64a)ci->hlen); + DEBUG_PRINTF("shift_r %d", shift_r); + } else { + h_start += ci->hlen + offset; + } + if (offset + 7 < 0) { + DEBUG_PRINTF("all in history buffer\n"); + data = partial_load_u64a(h_start, 8 - shift_r); + } else { + // history part + c_len = offset + 8; + h_len = -offset - shift_r; + DEBUG_PRINTF("%d bytes in history\n", h_len); + s64a data_h = 0; + data_h = partial_load_u64a(h_start, h_len); + // current part + if (c_len > (s64a)ci->len) { + shift_l = c_len - ci->len; + c_len = ci->len; + } + data = partial_load_u64a(ci->buf, c_len); + data <<= h_len << 3; + data |= data_h; + } + if (shift_r) { + data <<= shift_r << 3; + } + } else { + // current buffer. 
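+        // Load 8 bytes at offset; fall back to a partial load if fewer than
+        // 8 valid bytes remain (shift_l counts the missing future bytes).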
+ if (offset + c_len > (s64a)ci->len) { + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future\n"); + return 1; + } + // some bytes in the future. + shift_l = offset + c_len - ci->len; + c_len = ci->len - offset; + data = partial_load_u64a(ci->buf + offset, c_len); + } else { + data = unaligned_load_u64a(ci->buf + offset); + } + } + + if (shift_l || shift_r) { + valid_data_mask = generateValidMask(shift_l, shift_r); + } + DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask); + + if (validateMask(data, valid_data_mask, + and_mask, cmp_mask, neg_mask)) { + DEBUG_PRINTF("check mask successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckMask32(const struct core_info *ci, const u8 *and_mask, + const u8 *cmp_mask, const u32 neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + m256 data = zeroes256(); // consists of the following four parts. + s32 c_shift = 0; // blank bytes after current. + s32 h_shift = 0; // blank bytes before history. + s32 h_len = 32; // number of bytes from history buffer. + s32 c_len = 0; // number of bytes from current buffer. + /* h_shift + h_len + c_len + c_shift = 32 need to be hold.*/ + + if (offset < 0) { + s32 h_offset = 0; // the start offset in history buffer. + if (offset < -(s64a)ci->hlen) { + if (offset + 32 <= -(s64a)ci->hlen) { + DEBUG_PRINTF("all before history\n"); + return 1; + } + h_shift = -(offset + (s64a)ci->hlen); + h_len = 32 - h_shift; + } else { + h_offset = ci->hlen + offset; + } + if (offset + 32 > 0) { + // part in current buffer. + c_len = offset + 32; + h_len = -(offset + h_shift); + if (c_len > (s64a)ci->len) { + // out of current buffer. + c_shift = c_len - ci->len; + c_len = ci->len; + } + copy_upto_64_bytes((u8 *)&data - offset, ci->buf, c_len); + } + assert(h_shift + h_len + c_len + c_shift == 32); + copy_upto_64_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); + } else { + if (offset + 32 > (s64a)ci->len) { + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future.\n"); + return 1; + } + c_len = ci->len - offset; + c_shift = 32 - c_len; + assert(c_len <= 32); + copy_upto_64_bytes((u8 *)&data, ci->buf + offset, c_len); + } else { + data = loadu256(ci->buf + offset); + } + } + DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); + DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); + // we use valid_data_mask to blind bytes before history/in the future. 
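+    // One bit per byte of the 32-byte window: the low h_shift bits (before
+    // history) and the high c_shift bits (past the current buffer) are zero.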
+ u32 valid_data_mask; + valid_data_mask = (~0u) << (h_shift + c_shift) >> (c_shift); + + m256 and_mask_m256 = loadu256(and_mask); + m256 cmp_mask_m256 = loadu256(cmp_mask); + if (validateMask32(data, valid_data_mask, and_mask_m256, + cmp_mask_m256, neg_mask)) { + DEBUG_PRINTF("Mask32 passed\n"); + return 1; + } + return 0; +} + +#ifdef HAVE_AVX512 +static rose_inline +int roseCheckMask64(const struct core_info *ci, const u8 *and_mask, + const u8 *cmp_mask, const u64a neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + m512 data = zeroes512(); // consists of the following four parts. + s32 c_shift = 0; // blank bytes after current. + s32 h_shift = 0; // blank bytes before history. + s32 h_len = 64; // number of bytes from history buffer. + s32 c_len = 0; // number of bytes from current buffer. + /* h_shift + h_len + c_len + c_shift = 64 need to be hold.*/ + + if (offset < 0) { + s32 h_offset = 0; // the start offset in history buffer. + if (offset < -(s64a)ci->hlen) { + if (offset + 64 <= -(s64a)ci->hlen) { + DEBUG_PRINTF("all before history\n"); + return 1; + } + h_shift = -(offset + (s64a)ci->hlen); + h_len = 64 - h_shift; + } else { + h_offset = ci->hlen + offset; + } + if (offset + 64 > 0) { + // part in current buffer. + c_len = offset + 64; + h_len = -(offset + h_shift); + if (c_len > (s64a)ci->len) { + // out of current buffer. + c_shift = c_len - ci->len; + c_len = ci->len; + } + copy_upto_64_bytes((u8 *)&data - offset, ci->buf, c_len); + } + assert(h_shift + h_len + c_len + c_shift == 64); + copy_upto_64_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); + } else { + if (offset + 64 > (s64a)ci->len) { + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future.\n"); + return 1; + } + c_len = ci->len - offset; + c_shift = 64 - c_len; + copy_upto_64_bytes((u8 *)&data, ci->buf + offset, c_len); + } else { + data = loadu512(ci->buf + offset); + } + } + DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); + DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); + // we use valid_data_mask to blind bytes before history/in the future. + u64a valid_data_mask; + valid_data_mask = (~0ULL) << (h_shift + c_shift) >> (c_shift); + + m512 and_mask_m512 = loadu512(and_mask); + m512 cmp_mask_m512 = loadu512(cmp_mask); + + if (validateMask64(data, valid_data_mask, and_mask_m512, + cmp_mask_m512, neg_mask)) { + DEBUG_PRINTF("Mask64 passed\n"); + return 1; + } + return 0; +} +#endif + +// get 128/256/512 bits data from history and current buffer. +// return data and valid_data_mask. +static rose_inline +u64a getBufferDataComplex(const struct core_info *ci, const s64a loc, + u8 *data, const u32 data_len) { + assert(data_len == 16 || data_len == 32 || data_len == 64); + s32 c_shift = 0; // blank bytes after current. + s32 h_shift = 0; // blank bytes before history. + s32 h_len = data_len; // number of bytes from history buffer. + s32 c_len = 0; // number of bytes from current buffer. + if (loc < 0) { + s32 h_offset = 0; // the start offset in history buffer. 
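+        // The window starts in (or before) history; work out how many bytes,
+        // if any, fall before the start of the history buffer.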
+ if (loc < -(s64a)ci->hlen) { + if (loc + data_len <= -(s64a)ci->hlen) { + DEBUG_PRINTF("all before history\n"); + return 0; + } + h_shift = -(loc + (s64a)ci->hlen); + h_len = data_len - h_shift; + } else { + h_offset = ci->hlen + loc; + } + if (loc + data_len > 0) { + // part in current buffer. + c_len = loc + data_len; + h_len = -(loc + h_shift); + if (c_len > (s64a)ci->len) { + // out of current buffer. + c_shift = c_len - ci->len; + c_len = ci->len; + } + copy_upto_64_bytes(data - loc, ci->buf, c_len); + } + assert(h_shift + h_len + c_len + c_shift == (s32)data_len); + copy_upto_64_bytes(data + h_shift, ci->hbuf + h_offset, h_len); + } else { + if (loc + data_len > (s64a)ci->len) { + if (loc >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future.\n"); + return 0; + } + c_len = ci->len - loc; + c_shift = data_len - c_len; + assert(c_len <= data_len); + copy_upto_64_bytes(data, ci->buf + loc, c_len); + } else { +#ifdef HAVE_AVX512 + if (data_len == 64) { + storeu512(data, loadu512(ci->buf + loc)); + return ~0ULL; + } +#endif + if (data_len == 16) { + storeu128(data, loadu128(ci->buf + loc)); + return 0xffff; + } else { + storeu256(data, loadu256(ci->buf + loc)); + return 0xffffffff; + } + } + } + DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); + DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); + +#ifdef HAVE_AVX512 + if (data_len == 64) { + return (~0ULL) << (h_shift + c_shift) >> c_shift; + } +#endif + if (data_len == 16) { + return (u16)(0xffff << (h_shift + c_shift)) >> c_shift; + } else { + return (~0u) << (h_shift + c_shift) >> c_shift; + } +} + +static rose_inline +m128 getData128(const struct core_info *ci, s64a offset, u32 *valid_data_mask) { + if (offset > 0 && offset + sizeof(m128) <= ci->len) { + *valid_data_mask = 0xffff; + return loadu128(ci->buf + offset); + } + ALIGN_DIRECTIVE u8 data[sizeof(m128)]; + *valid_data_mask = getBufferDataComplex(ci, offset, data, 16); + return *(m128 *)data; +} + +static rose_inline +m256 getData256(const struct core_info *ci, s64a offset, u32 *valid_data_mask) { + if (offset > 0 && offset + sizeof(m256) <= ci->len) { + *valid_data_mask = ~0u; + return loadu256(ci->buf + offset); + } + ALIGN_AVX_DIRECTIVE u8 data[sizeof(m256)]; + *valid_data_mask = getBufferDataComplex(ci, offset, data, 32); + return *(m256 *)data; +} + +#ifdef HAVE_AVX512 +static rose_inline +m512 getData512(const struct core_info *ci, s64a offset, u64a *valid_data_mask) { + if (offset > 0 && offset + sizeof(m512) <= ci->len) { + *valid_data_mask = ~0ULL; + return loadu512(ci->buf + offset); + } + ALIGN_CL_DIRECTIVE u8 data[sizeof(m512)]; + *valid_data_mask = getBufferDataComplex(ci, offset, data, 64); + return *(m512 *)data; +} +#endif + +static rose_inline +int roseCheckShufti16x8(const struct core_info *ci, const u8 *nib_mask, + const u8 *bucket_select_mask, u32 neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u32 valid_data_mask = 0; + m128 data = getData128(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m256 nib_mask_m256 = loadu256(nib_mask); + m128 bucket_select_mask_m128 = loadu128(bucket_select_mask); + if (validateShuftiMask16x8(data, nib_mask_m256, + 
bucket_select_mask_m128, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 16x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti16x16(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask, + u32 neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u32 valid_data_mask = 0; + m128 data = getData128(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m256 data_m256 = set2x128(data); + m256 hi_mask_m256 = loadu256(hi_mask); + m256 lo_mask_m256 = loadu256(lo_mask); + m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); + if (validateShuftiMask16x16(data_m256, hi_mask_m256, lo_mask_m256, + bucket_select_mask_m256, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 16x16 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti32x8(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask, + u32 neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u32 valid_data_mask = 0; + m256 data = getData256(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m128 hi_mask_m128 = loadu128(hi_mask); + m128 lo_mask_m128 = loadu128(lo_mask); + m256 hi_mask_m256 = set2x128(hi_mask_m128); + m256 lo_mask_m256 = set2x128(lo_mask_m128); + m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); + if (validateShuftiMask32x8(data, hi_mask_m256, lo_mask_m256, + bucket_select_mask_m256, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 32x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti32x16(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask_hi, + const u8 *bucket_select_mask_lo, u32 neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u32 valid_data_mask = 0; + m256 data = getData256(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m256 hi_mask_1 = loadu2x128(hi_mask); + m256 hi_mask_2 = loadu2x128(hi_mask + 16); + m256 lo_mask_1 = loadu2x128(lo_mask); + m256 lo_mask_2 = loadu2x128(lo_mask + 16); + + m256 bucket_mask_hi = loadu256(bucket_select_mask_hi); + m256 bucket_mask_lo = loadu256(bucket_select_mask_lo); + if (validateShuftiMask32x16(data, hi_mask_1, hi_mask_2, + lo_mask_1, lo_mask_2, bucket_mask_hi, + bucket_mask_lo, neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check 
shufti 32x16 successfully\n"); + return 1; + } else { + return 0; + } +} + +#ifdef HAVE_AVX512 +static rose_inline +int roseCheckShufti64x8(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask, + u64a neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u64a valid_data_mask = 0; + m512 data = getData512(ci, offset, &valid_data_mask); + + if (unlikely(!valid_data_mask)) { + return 1; + } + + m512 hi_mask_m512 = loadu512(hi_mask); + m512 lo_mask_m512 = loadu512(lo_mask); + m512 bucket_select_mask_m512 = loadu512(bucket_select_mask); + if (validateShuftiMask64x8(data, hi_mask_m512, lo_mask_m512, + bucket_select_mask_m512, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 64x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti64x16(const struct core_info *ci, const u8 *hi_mask_1, + const u8 *hi_mask_2, const u8 *lo_mask_1, + const u8 *lo_mask_2, const u8 *bucket_select_mask_hi, + const u8 *bucket_select_mask_lo, u64a neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u64a valid_data_mask = 0; + m512 data = getData512(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m512 hi_mask_1_m512 = loadu512(hi_mask_1); + m512 hi_mask_2_m512 = loadu512(hi_mask_2); + m512 lo_mask_1_m512 = loadu512(lo_mask_1); + m512 lo_mask_2_m512 = loadu512(lo_mask_2); + + m512 bucket_select_mask_hi_m512 = loadu512(bucket_select_mask_hi); + m512 bucket_select_mask_lo_m512 = loadu512(bucket_select_mask_lo); + if (validateShuftiMask64x16(data, hi_mask_1_m512, hi_mask_2_m512, + lo_mask_1_m512, lo_mask_2_m512, + bucket_select_mask_hi_m512, + bucket_select_mask_lo_m512, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 64x16 successfully\n"); + return 1; + } else { + return 0; + } +} +#endif + +static rose_inline +int roseCheckSingleLookaround(const struct RoseEngine *t, + const struct hs_scratch *scratch, + s8 checkOffset, u32 lookaroundReachIndex, + u64a end) { + assert(lookaroundReachIndex != MO_INVALID_IDX); + const struct core_info *ci = &scratch->core_info; + DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, + ci->buf_offset, ci->buf_offset + ci->len); + + const s64a base_offset = end - ci->buf_offset; + const s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("base_offset=%lld\n", base_offset); + DEBUG_PRINTF("checkOffset=%d offset=%lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + const u8 *reach = getByOffset(t, lookaroundReachIndex); + + u8 c; + if (offset >= 0 && offset < (s64a)ci->len) { + c = ci->buf[offset]; + } else if (offset < 0 && offset >= -(s64a)ci->hlen) { + c = ci->hbuf[ci->hlen + offset]; + } else { + return 1; + } + + if (!reachHasBit(reach, c)) { 
+ DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + + DEBUG_PRINTF("OK :)\n"); + return 1; +} + +/** + * \brief Scan around a literal, checking that that "lookaround" reach masks + * are satisfied. + */ +static rose_inline +int roseCheckLookaround(const struct RoseEngine *t, + const struct hs_scratch *scratch, + u32 lookaroundLookIndex, u32 lookaroundReachIndex, + u32 lookaroundCount, u64a end) { + assert(lookaroundLookIndex != MO_INVALID_IDX); + assert(lookaroundReachIndex != MO_INVALID_IDX); + assert(lookaroundCount > 0); + + const struct core_info *ci = &scratch->core_info; + DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, + ci->buf_offset, ci->buf_offset + ci->len); + + const s8 *look = getByOffset(t, lookaroundLookIndex); + const s8 *look_end = look + lookaroundCount; + assert(look < look_end); + + const u8 *reach = getByOffset(t, lookaroundReachIndex); + + // The following code assumes that the lookaround structures are ordered by + // increasing offset. + + const s64a base_offset = end - ci->buf_offset; + DEBUG_PRINTF("base_offset=%lld\n", base_offset); + DEBUG_PRINTF("first look has offset %d\n", *look); + + // If our first check tells us we need to look at an offset before the + // start of the stream, this role cannot match. + if (unlikely(*look < 0 && (u64a)(0 - *look) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + // Skip over offsets that are before the history buffer. + do { + s64a offset = base_offset + *look; + if (offset >= -(s64a)ci->hlen) { + goto in_history; + } + DEBUG_PRINTF("look=%d before history\n", *look); + look++; + reach += REACH_BITVECTOR_LEN; + } while (look < look_end); + + // History buffer. + DEBUG_PRINTF("scan history (%zu looks left)\n", look_end - look); + for (; look < look_end; ++look, reach += REACH_BITVECTOR_LEN) { + in_history: + ; + s64a offset = base_offset + *look; + DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); + + if (offset >= 0) { + DEBUG_PRINTF("in buffer\n"); + goto in_buffer; + } + + assert(offset >= -(s64a)ci->hlen && offset < 0); + u8 c = ci->hbuf[ci->hlen + offset]; + if (!reachHasBit(reach, c)) { + DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + } + // Current buffer. + DEBUG_PRINTF("scan buffer (%zu looks left)\n", look_end - look); + for (; look < look_end; ++look, reach += REACH_BITVECTOR_LEN) { + in_buffer: + ; + s64a offset = base_offset + *look; + DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); + + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("in the future\n"); + break; + } + + assert(offset >= 0 && offset < (s64a)ci->len); + u8 c = ci->buf[offset]; + if (!reachHasBit(reach, c)) { + DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + } + + DEBUG_PRINTF("OK :)\n"); + return 1; +} + +/** + * \brief Trying to find a matching path by the corresponding path mask of + * every lookaround location. 
+ */ +static rose_inline +int roseMultipathLookaround(const struct RoseEngine *t, + const struct hs_scratch *scratch, + u32 multipathLookaroundLookIndex, + u32 multipathLookaroundReachIndex, + u32 multipathLookaroundCount, + s32 last_start, const u8 *start_mask, + u64a end) { + assert(multipathLookaroundCount > 0); + + const struct core_info *ci = &scratch->core_info; + DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, + ci->buf_offset, ci->buf_offset + ci->len); + + const s8 *look = getByOffset(t, multipathLookaroundLookIndex); + const s8 *look_end = look + multipathLookaroundCount; + assert(look < look_end); + + const u8 *reach = getByOffset(t, multipathLookaroundReachIndex); + + const s64a base_offset = (s64a)end - ci->buf_offset; + DEBUG_PRINTF("base_offset=%lld\n", base_offset); + + u8 path = 0xff; + + assert(last_start < 0); + + if (unlikely((u64a)(0 - last_start) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + s8 base_look_offset = *look; + do { + s64a offset = base_offset + *look; + u32 start_offset = (u32)(*look - base_look_offset); + DEBUG_PRINTF("start_mask[%u] = %x\n", start_offset, + start_mask[start_offset]); + path = start_mask[start_offset]; + if (offset >= -(s64a)ci->hlen) { + break; + } + DEBUG_PRINTF("look=%d before history\n", *look); + look++; + reach += MULTI_REACH_BITVECTOR_LEN; + } while (look < look_end); + + DEBUG_PRINTF("scan history (%zu looks left)\n", look_end - look); + for (; look < look_end; ++look, reach += MULTI_REACH_BITVECTOR_LEN) { + s64a offset = base_offset + *look; + DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); + + if (offset >= 0) { + DEBUG_PRINTF("in buffer\n"); + break; + } + + assert(offset >= -(s64a)ci->hlen && offset < 0); + u8 c = ci->hbuf[ci->hlen + offset]; + path &= reach[c]; + DEBUG_PRINTF("reach[%x] = %02x path = %0xx\n", c, reach[c], path); + if (!path) { + DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + } + + DEBUG_PRINTF("scan buffer (%zu looks left)\n", look_end - look); + for(; look < look_end; ++look, reach += MULTI_REACH_BITVECTOR_LEN) { + s64a offset = base_offset + *look; + DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); + + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("in the future\n"); + break; + } + + assert(offset >= 0 && offset < (s64a)ci->len); + u8 c = ci->buf[offset]; + path &= reach[c]; + DEBUG_PRINTF("reach[%x] = %02x path = %0xx\n", c, reach[c], path); + if (!path) { + DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + } + + DEBUG_PRINTF("OK :)\n"); + return 1; +} + +static never_inline +int roseCheckMultipathShufti16x8(const struct hs_scratch *scratch, + const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_16x8 *ri, + u64a end) { + const struct core_info *ci = &scratch->core_info; + s32 checkOffset = ri->base_offset; + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + assert(ri->last_start <= 0); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + if ((u64a)(0 - ri->last_start) > end) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + } + + u32 valid_data_mask; + m128 data_init = getData128(ci, offset, &valid_data_mask); + m128 data_select_mask = loadu128(ri->data_select_mask); + + u32 valid_path_mask = 0; + if (unlikely(!(valid_data_mask & 1))) { + DEBUG_PRINTF("lose part of backward data\n"); + 
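+        /* Added note: expand the 16-bit validity mask to one flag byte per
+         * input byte, shuffle it with the same data_select_mask as the data,
+         * and invert the movemask: set bits then mark path positions that
+         * selected bytes outside the valid window. */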
DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); + + m128 expand_valid; + u64a expand_mask = 0x8080808080808080ULL; + u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask); + u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); + DEBUG_PRINTF("expand_hi %llx\n", valid_hi); + DEBUG_PRINTF("expand_lo %llx\n", valid_lo); + expand_valid = set64x2(valid_hi, valid_lo); + valid_path_mask = ~movemask128(pshufb_m128(expand_valid, + data_select_mask)); + } + + m128 data = pshufb_m128(data_init, data_select_mask); + m256 nib_mask = loadu256(ri->nib_mask); + m128 bucket_select_mask = loadu128(ri->bucket_select_mask); + + u32 hi_bits_mask = ri->hi_bits_mask; + u32 lo_bits_mask = ri->lo_bits_mask; + u32 neg_mask = ri->neg_mask; + + if (validateMultipathShuftiMask16x8(data, nib_mask, + bucket_select_mask, + hi_bits_mask, lo_bits_mask, + neg_mask, valid_path_mask)) { + DEBUG_PRINTF("check multi-path shufti-16x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static never_inline +int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch, + const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x8 *ri, + u64a end) { + const struct core_info *ci = &scratch->core_info; + s32 checkOffset = ri->base_offset; + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + assert(ri->last_start <= 0); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + if ((u64a)(0 - ri->last_start) > end) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + } + + u32 valid_data_mask; + m128 data_m128 = getData128(ci, offset, &valid_data_mask); + m256 data_double = set2x128(data_m128); + m256 data_select_mask = loadu256(ri->data_select_mask); + + u32 valid_path_mask = 0; + m256 expand_valid; + if (unlikely(!(valid_data_mask & 1))) { + DEBUG_PRINTF("lose part of backward data\n"); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); + + u64a expand_mask = 0x8080808080808080ULL; + u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask); + u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); + DEBUG_PRINTF("expand_hi %llx\n", valid_hi); + DEBUG_PRINTF("expand_lo %llx\n", valid_lo); + expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + valid_lo); + valid_path_mask = ~movemask256(pshufb_m256(expand_valid, + data_select_mask)); + } + + m256 data = pshufb_m256(data_double, data_select_mask); + m256 hi_mask = loadu2x128(ri->hi_mask); + m256 lo_mask = loadu2x128(ri->lo_mask); + m256 bucket_select_mask = loadu256(ri->bucket_select_mask); + + u32 hi_bits_mask = ri->hi_bits_mask; + u32 lo_bits_mask = ri->lo_bits_mask; + u32 neg_mask = ri->neg_mask; + + if (validateMultipathShuftiMask32x8(data, hi_mask, lo_mask, + bucket_select_mask, + hi_bits_mask, lo_bits_mask, + neg_mask, valid_path_mask)) { + DEBUG_PRINTF("check multi-path shufti-32x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static never_inline +int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch, + const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x16 *ri, + u64a end) { + const struct core_info *ci = &scratch->core_info; + const s64a base_offset = (s64a)end - ci->buf_offset; + s32 checkOffset = ri->base_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + 
assert(ri->last_start <= 0); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + if ((u64a)(0 - ri->last_start) > end) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + } + + u32 valid_data_mask; + m128 data_m128 = getData128(ci, offset, &valid_data_mask); + m256 data_double = set2x128(data_m128); + m256 data_select_mask = loadu256(ri->data_select_mask); + + u32 valid_path_mask = 0; + m256 expand_valid; + if (unlikely(!(valid_data_mask & 1))) { + DEBUG_PRINTF("lose part of backward data\n"); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); + + u64a expand_mask = 0x8080808080808080ULL; + u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask); + u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); + DEBUG_PRINTF("expand_hi %llx\n", valid_hi); + DEBUG_PRINTF("expand_lo %llx\n", valid_lo); + expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + valid_lo); + valid_path_mask = ~movemask256(pshufb_m256(expand_valid, + data_select_mask)); + } + + m256 data = pshufb_m256(data_double, data_select_mask); + + m256 hi_mask_1 = loadu2x128(ri->hi_mask); + m256 hi_mask_2 = loadu2x128(ri->hi_mask + 16); + m256 lo_mask_1 = loadu2x128(ri->lo_mask); + m256 lo_mask_2 = loadu2x128(ri->lo_mask + 16); + + m256 bucket_select_mask_hi = loadu256(ri->bucket_select_mask_hi); + m256 bucket_select_mask_lo = loadu256(ri->bucket_select_mask_lo); + + u32 hi_bits_mask = ri->hi_bits_mask; + u32 lo_bits_mask = ri->lo_bits_mask; + u32 neg_mask = ri->neg_mask; + + if (validateMultipathShuftiMask32x16(data, hi_mask_1, hi_mask_2, + lo_mask_1, lo_mask_2, + bucket_select_mask_hi, + bucket_select_mask_lo, + hi_bits_mask, lo_bits_mask, + neg_mask, valid_path_mask)) { + DEBUG_PRINTF("check multi-path shufti-32x16 successfully\n"); + return 1; + } else { + return 0; + } +} + +static never_inline +int roseCheckMultipathShufti64(const struct hs_scratch *scratch, + const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_64 *ri, + u64a end) { + const struct core_info *ci = &scratch->core_info; + const s64a base_offset = (s64a)end - ci->buf_offset; + s32 checkOffset = ri->base_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + if ((u64a)(0 - ri->last_start) > end) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + } + + u32 valid_data_mask; + m128 data_m128 = getData128(ci, offset, &valid_data_mask); + m256 data_m256 = set2x128(data_m128); + m256 data_select_mask_1 = loadu256(ri->data_select_mask); + m256 data_select_mask_2 = loadu256(ri->data_select_mask + 32); + + u64a valid_path_mask = 0; + m256 expand_valid; + if (unlikely(!(valid_data_mask & 1))) { + DEBUG_PRINTF("lose part of backward data\n"); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); + + u64a expand_mask = 0x8080808080808080ULL; + u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask); + u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); + DEBUG_PRINTF("expand_hi %llx\n", valid_hi); + DEBUG_PRINTF("expand_lo %llx\n", valid_lo); + expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + valid_lo); + u32 valid_path_1 = movemask256(pshufb_m256(expand_valid, + data_select_mask_1)); + u32 valid_path_2 = movemask256(pshufb_m256(expand_valid, + data_select_mask_2)); + valid_path_mask = ~((u64a)valid_path_1 | (u64a)valid_path_2 << 32); + } + + m256 data_1 = pshufb_m256(data_m256, data_select_mask_1); + m256 
data_2 = pshufb_m256(data_m256, data_select_mask_2); + + m256 hi_mask = loadu2x128(ri->hi_mask); + m256 lo_mask = loadu2x128(ri->lo_mask); + + m256 bucket_select_mask_1 = loadu256(ri->bucket_select_mask); + m256 bucket_select_mask_2 = loadu256(ri->bucket_select_mask + 32); + + u64a hi_bits_mask = ri->hi_bits_mask; + u64a lo_bits_mask = ri->lo_bits_mask; + u64a neg_mask = ri->neg_mask; + + if (validateMultipathShuftiMask64(data_1, data_2, hi_mask, lo_mask, + bucket_select_mask_1, + bucket_select_mask_2, hi_bits_mask, + lo_bits_mask, neg_mask, + valid_path_mask)) { + DEBUG_PRINTF("check multi-path shufti-64 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseNfaEarliestSom(u64a start, UNUSED u64a end, UNUSED ReportID id, + void *context) { + assert(context); + u64a *som = context; + *som = MIN(*som, start); + return MO_CONTINUE_MATCHING; +} + +static rose_inline +u64a roseGetHaigSom(const struct RoseEngine *t, struct hs_scratch *scratch, + const u32 qi, UNUSED const u32 leftfixLag) { + u32 ri = queueToLeftIndex(t, qi); + + UNUSED const struct LeftNfaInfo *left = getLeftTable(t) + ri; + + DEBUG_PRINTF("testing %s prefix %u/%u with lag %u (maxLag=%u)\n", + left->transient ? "transient" : "active", ri, qi, + leftfixLag, left->maxLag); + + assert(leftfixLag <= left->maxLag); + + struct mq *q = scratch->queues + qi; + + u64a start = ~0ULL; + + /* switch the callback + context for a fun one */ + q->cb = roseNfaEarliestSom; + q->context = &start; + + nfaReportCurrentMatches(q->nfa, q); + + /* restore the old callback + context */ + q->cb = roseNfaAdaptor; + q->context = NULL; + DEBUG_PRINTF("earliest som is %llu\n", start); + return start; +} + +static rose_inline +char roseCheckBounds(u64a end, u64a min_bound, u64a max_bound) { + DEBUG_PRINTF("check offset=%llu against bounds [%llu,%llu]\n", end, + min_bound, max_bound); + assert(min_bound <= max_bound); + return end >= min_bound && end <= max_bound; +} + +static rose_inline +hwlmcb_rv_t roseEnginesEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset, + u32 iter_offset) { + const char is_streaming = rose->mode != HS_MODE_BLOCK; + + /* data, len is used for state decompress, should be full available data */ + u8 key = 0; + if (is_streaming) { + const u8 *eod_data = scratch->core_info.hbuf; + size_t eod_len = scratch->core_info.hlen; + key = eod_len ? eod_data[eod_len - 1] : 0; + } + + const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); + const u32 aaCount = rose->activeArrayCount; + const u32 qCount = rose->queueCount; + struct fatbit *aqa = scratch->aqa; + + const struct mmbit_sparse_iter *it = getByOffset(rose, iter_offset); + assert(ISALIGNED(it)); + + u32 idx = 0; + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + for (u32 qi = mmbit_sparse_iter_begin(aa, aaCount, &idx, it, si_state); + qi != MMB_INVALID; + qi = mmbit_sparse_iter_next(aa, aaCount, qi, &idx, it, si_state)) { + DEBUG_PRINTF("checking nfa %u\n", qi); + struct mq *q = scratch->queues + qi; + if (!fatbit_set(aqa, qCount, qi)) { + initQueue(q, qi, rose, scratch); + } + + assert(q->nfa == getNfaByQueue(rose, qi)); + assert(nfaAcceptsEod(q->nfa)); + + if (is_streaming) { + // Decompress stream state. 
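+            // Added note: the last history byte is passed as the
+            // decompression key, which engines may use when rebuilding their
+            // expanded state.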
+ nfaExpandState(q->nfa, q->state, q->streamState, offset, key); + } + + if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, + roseReportAdaptor, + scratch) == MO_HALT_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + } + + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +hwlmcb_rv_t roseSuffixesEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset) { + const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); + const u32 aaCount = rose->activeArrayCount; + + for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; + qi = mmbit_iterate(aa, aaCount, qi)) { + DEBUG_PRINTF("checking nfa %u\n", qi); + struct mq *q = scratch->queues + qi; + assert(q->nfa == getNfaByQueue(rose, qi)); + assert(nfaAcceptsEod(q->nfa)); + + /* We have just been triggered. */ + assert(fatbit_isset(scratch->aqa, rose->queueCount, qi)); + + pushQueueNoMerge(q, MQE_END, scratch->core_info.len); + q->context = NULL; + + /* rose exec is used as we don't want to / can't raise matches in the + * history buffer. */ + if (!nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX)) { + DEBUG_PRINTF("nfa is dead\n"); + continue; + } + if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, + roseReportAdaptor, + scratch) == MO_HALT_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + } + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +hwlmcb_rv_t roseMatcherEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset) { + assert(rose->ematcherOffset); + assert(rose->ematcherRegionSize); + + // Clear role state and active engines, since we have already handled all + // outstanding work there. + DEBUG_PRINTF("clear role state and active leaf array\n"); + char *state = scratch->core_info.state; + mmbit_clear(getRoleState(state), rose->rolesWithStateCount); + mmbit_clear(getActiveLeafArray(rose, state), rose->activeArrayCount); + + const char is_streaming = rose->mode != HS_MODE_BLOCK; + + size_t eod_len; + const u8 *eod_data; + if (!is_streaming) { /* Block */ + eod_data = scratch->core_info.buf; + eod_len = scratch->core_info.len; + } else { /* Streaming */ + eod_len = scratch->core_info.hlen; + eod_data = scratch->core_info.hbuf; + } + + assert(eod_data); + assert(eod_len); + + DEBUG_PRINTF("%zu bytes of eod data to scan at offset %llu\n", eod_len, + offset); + + // If we don't have enough bytes to produce a match from an EOD table scan, + // there's no point scanning. + if (eod_len < rose->eodmatcherMinWidth) { + DEBUG_PRINTF("too short for min width %u\n", rose->eodmatcherMinWidth); + return HWLM_CONTINUE_MATCHING; + } + + // Ensure that we only need scan the last N bytes, where N is the length of + // the eod-anchored matcher region. + size_t adj = eod_len - MIN(eod_len, rose->ematcherRegionSize); + + const struct HWLM *etable = getByOffset(rose, rose->ematcherOffset); + hwlmExec(etable, eod_data, eod_len, adj, roseCallback, scratch, + scratch->tctxt.groups); + + // We may need to fire delayed matches. 
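+    // Added note: any delayed literal matches still pending (e.g. ones queued
+    // by the eod-anchored scan above) are flushed here before we finish EOD
+    // processing.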
+ if (cleanUpDelayed(rose, scratch, 0, offset) == HWLM_TERMINATE_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + + roseFlushLastByteHistory(rose, scratch, offset); + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +int roseCheckLongLiteral(const struct RoseEngine *t, + const struct hs_scratch *scratch, u64a end, + u32 lit_offset, u32 lit_length, char nocase) { + const struct core_info *ci = &scratch->core_info; + const u8 *lit = getByOffset(t, lit_offset); + + DEBUG_PRINTF("check lit at %llu, length %u\n", end, lit_length); + DEBUG_PRINTF("base buf_offset=%llu\n", ci->buf_offset); + + if (end < lit_length) { + DEBUG_PRINTF("too short!\n"); + return 0; + } + + // If any portion of the literal matched in the current buffer, check it. + if (end > ci->buf_offset) { + u32 scan_len = MIN(end - ci->buf_offset, lit_length); + u64a scan_start = end - ci->buf_offset - scan_len; + DEBUG_PRINTF("checking suffix (%u bytes) in buf[%llu:%llu]\n", scan_len, + scan_start, end); + if (cmpForward(ci->buf + scan_start, lit + lit_length - scan_len, + scan_len, nocase)) { + DEBUG_PRINTF("cmp of suffix failed\n"); + return 0; + } + } + + // If the entirety of the literal was in the current block, we are done. + if (end - lit_length >= ci->buf_offset) { + DEBUG_PRINTF("literal confirmed in current block\n"); + return 1; + } + + // We still have a prefix which we must test against the buffer prepared by + // the long literal table. This is only done in streaming mode. + + assert(t->mode != HS_MODE_BLOCK); + + const u8 *ll_buf; + size_t ll_len; + if (nocase) { + ll_buf = scratch->tctxt.ll_buf_nocase; + ll_len = scratch->tctxt.ll_len_nocase; + } else { + ll_buf = scratch->tctxt.ll_buf; + ll_len = scratch->tctxt.ll_len; + } + + assert(ll_buf); + + u64a lit_start_offset = end - lit_length; + u32 prefix_len = MIN(lit_length, ci->buf_offset - lit_start_offset); + u32 hist_rewind = ci->buf_offset - lit_start_offset; + DEBUG_PRINTF("ll_len=%zu, hist_rewind=%u\n", ll_len, hist_rewind); + if (hist_rewind > ll_len) { + DEBUG_PRINTF("not enough history\n"); + return 0; + } + + DEBUG_PRINTF("check prefix len=%u from hist (len %zu, rewind %u)\n", + prefix_len, ll_len, hist_rewind); + assert(hist_rewind <= ll_len); + if (cmpForward(ll_buf + ll_len - hist_rewind, lit, prefix_len, nocase)) { + DEBUG_PRINTF("cmp of prefix failed\n"); + return 0; + } + + DEBUG_PRINTF("cmp succeeded\n"); + return 1; +} + +static rose_inline +int roseCheckMediumLiteral(const struct RoseEngine *t, + const struct hs_scratch *scratch, u64a end, + u32 lit_offset, u32 lit_length, char nocase) { + const struct core_info *ci = &scratch->core_info; + const u8 *lit = getByOffset(t, lit_offset); + + DEBUG_PRINTF("check lit at %llu, length %u\n", end, lit_length); + DEBUG_PRINTF("base buf_offset=%llu\n", ci->buf_offset); + + if (end < lit_length) { + DEBUG_PRINTF("too short!\n"); + return 0; + } + + // If any portion of the literal matched in the current buffer, check it. + if (end > ci->buf_offset) { + u32 scan_len = MIN(end - ci->buf_offset, lit_length); + u64a scan_start = end - ci->buf_offset - scan_len; + DEBUG_PRINTF("checking suffix (%u bytes) in buf[%llu:%llu]\n", scan_len, + scan_start, end); + if (cmpForward(ci->buf + scan_start, lit + lit_length - scan_len, + scan_len, nocase)) { + DEBUG_PRINTF("cmp of suffix failed\n"); + return 0; + } + } + + // If the entirety of the literal was in the current block, we are done. 
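+    // Added note: end - lit_length is the literal's start offset; if it is
+    // not before buf_offset, no history bytes are involved in the confirm.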
+ if (end - lit_length >= ci->buf_offset) { + DEBUG_PRINTF("literal confirmed in current block\n"); + return 1; + } + + // We still have a prefix which we must test against the history buffer. + assert(t->mode != HS_MODE_BLOCK); + + u64a lit_start_offset = end - lit_length; + u32 prefix_len = MIN(lit_length, ci->buf_offset - lit_start_offset); + u32 hist_rewind = ci->buf_offset - lit_start_offset; + DEBUG_PRINTF("hlen=%zu, hist_rewind=%u\n", ci->hlen, hist_rewind); + + // History length check required for confirm in the EOD and delayed + // rebuild paths. + if (hist_rewind > ci->hlen) { + DEBUG_PRINTF("not enough history\n"); + return 0; + } + + DEBUG_PRINTF("check prefix len=%u from hist (len %zu, rewind %u)\n", + prefix_len, ci->hlen, hist_rewind); + assert(hist_rewind <= ci->hlen); + if (cmpForward(ci->hbuf + ci->hlen - hist_rewind, lit, prefix_len, + nocase)) { + DEBUG_PRINTF("cmp of prefix failed\n"); + return 0; + } + + DEBUG_PRINTF("cmp succeeded\n"); + return 1; +} + +static +void updateSeqPoint(struct RoseContext *tctxt, u64a offset, + const char from_mpv) { + if (from_mpv) { + updateMinMatchOffsetFromMpv(tctxt, offset); + } else { + updateMinMatchOffset(tctxt, offset); + } +} + +static rose_inline +hwlmcb_rv_t flushActiveCombinations(const struct RoseEngine *t, + struct hs_scratch *scratch) { + u8 *cvec = (u8 *)scratch->core_info.combVector; + if (!mmbit_any(cvec, t->ckeyCount)) { + return HWLM_CONTINUE_MATCHING; + } + u64a end = scratch->tctxt.lastCombMatchOffset; + for (u32 i = mmbit_iterate(cvec, t->ckeyCount, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(cvec, t->ckeyCount, i)) { + const struct CombInfo *combInfoMap = (const struct CombInfo *) + ((const char *)t + t->combInfoMapOffset); + const struct CombInfo *ci = combInfoMap + i; + if ((ci->min_offset != 0) && (end < ci->min_offset)) { + DEBUG_PRINTF("halt: before min_offset=%llu\n", ci->min_offset); + continue; + } + if ((ci->max_offset != MAX_OFFSET) && (end > ci->max_offset)) { + DEBUG_PRINTF("halt: after max_offset=%llu\n", ci->max_offset); + continue; + } + + DEBUG_PRINTF("check ekey %u\n", ci->ekey); + if (ci->ekey != INVALID_EKEY) { + assert(ci->ekey < t->ekeyCount); + const char *evec = scratch->core_info.exhaustionVector; + if (isExhausted(t, evec, ci->ekey)) { + DEBUG_PRINTF("ekey %u already set, match is exhausted\n", + ci->ekey); + continue; + } + } + + DEBUG_PRINTF("check ckey %u\n", i); + char *lvec = scratch->core_info.logicalVector; + if (!isLogicalCombination(t, lvec, ci->start, ci->result)) { + DEBUG_PRINTF("Logical Combination Failed!\n"); + continue; + } + + DEBUG_PRINTF("Logical Combination Passed!\n"); + if (roseReportComb(t, scratch, end, ci->id, 0, + ci->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + clearCvec(t, (char *)cvec); + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +hwlmcb_rv_t checkPurelyNegatives(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end) { + for (u32 i = 0; i < t->ckeyCount; i++) { + const struct CombInfo *combInfoMap = (const struct CombInfo *) + ((const char *)t + t->combInfoMapOffset); + const struct CombInfo *ci = combInfoMap + i; + if ((ci->min_offset != 0) && (end < ci->min_offset)) { + DEBUG_PRINTF("halt: before min_offset=%llu\n", ci->min_offset); + continue; + } + if ((ci->max_offset != MAX_OFFSET) && (end > ci->max_offset)) { + DEBUG_PRINTF("halt: after max_offset=%llu\n", ci->max_offset); + continue; + } + + DEBUG_PRINTF("check ekey %u\n", ci->ekey); + if (ci->ekey != INVALID_EKEY) { + assert(ci->ekey < 
t->ekeyCount); + const char *evec = scratch->core_info.exhaustionVector; + if (isExhausted(t, evec, ci->ekey)) { + DEBUG_PRINTF("ekey %u already set, match is exhausted\n", + ci->ekey); + continue; + } + } + + DEBUG_PRINTF("check ckey %u purely negative\n", i); + char *lvec = scratch->core_info.logicalVector; + if (!isPurelyNegativeMatch(t, lvec, ci->start, ci->result)) { + DEBUG_PRINTF("Logical Combination from purely negative Failed!\n"); + continue; + } + + DEBUG_PRINTF("Logical Combination from purely negative Passed!\n"); + if (roseReportComb(t, scratch, end, ci->id, 0, + ci->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + return HWLM_CONTINUE_MATCHING; +} + +#if !defined(_WIN32) +#define PROGRAM_CASE(name) \ + case ROSE_INSTR_##name: { \ + LABEL_ROSE_INSTR_##name: \ + DEBUG_PRINTF("instruction: " #name " (pc=%u)\n", \ + programOffset + (u32)(pc - pc_base)); \ + const struct ROSE_STRUCT_##name *ri = \ + (const struct ROSE_STRUCT_##name *)pc; + +#define PROGRAM_NEXT_INSTRUCTION \ + pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN); \ + goto *(next_instr[*(const u8 *)pc]); \ + } + +#define PROGRAM_NEXT_INSTRUCTION_JUMP \ + goto *(next_instr[*(const u8 *)pc]); +#else +#define PROGRAM_CASE(name) \ + case ROSE_INSTR_##name: { \ + DEBUG_PRINTF("instruction: " #name " (pc=%u)\n", \ + programOffset + (u32)(pc - pc_base)); \ + const struct ROSE_STRUCT_##name *ri = \ + (const struct ROSE_STRUCT_##name *)pc; + +#define PROGRAM_NEXT_INSTRUCTION \ + pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN); \ + break; \ + } + +#define PROGRAM_NEXT_INSTRUCTION_JUMP continue; +#endif + +hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 programOffset, + u64a som, u64a end, u8 prog_flags) { + DEBUG_PRINTF("program=%u, offsets [%llu,%llu], flags=%u\n", programOffset, + som, end, prog_flags); + + if (programOffset != ROSE_INVALID_PROG_OFFSET) + assert(programOffset >= sizeof(struct RoseEngine)); + assert(programOffset < t->size); + + const char in_anchored = prog_flags & ROSE_PROG_FLAG_IN_ANCHORED; + const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP; + const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV; + const char skip_mpv_catchup = prog_flags & ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; + + const char *pc_base = getByOffset(t, programOffset); + const char *pc = pc_base; + + // Local sparse iterator state for programs that use the SPARSE_ITER_BEGIN + // and SPARSE_ITER_NEXT instructions. + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + // If this program has an effect, work_done will be set to one (which may + // allow the program to squash groups). + int work_done = 0; + + struct RoseContext *tctxt = &scratch->tctxt; + +#if !defined(_WIN32) + static const void *next_instr[] = { + &&LABEL_ROSE_INSTR_END, //!< End of program. + &&LABEL_ROSE_INSTR_ANCHORED_DELAY, //!< Delay until after anchored matcher. + &&LABEL_ROSE_INSTR_CHECK_LIT_EARLY, //!< Skip matches before floating min offset. + &&LABEL_ROSE_INSTR_CHECK_GROUPS, //!< Check that literal groups are on. + &&LABEL_ROSE_INSTR_CHECK_ONLY_EOD, //!< Role matches only at EOD. + &&LABEL_ROSE_INSTR_CHECK_BOUNDS, //!< Bounds on distance from offset 0. + &&LABEL_ROSE_INSTR_CHECK_NOT_HANDLED, //!< Test & set role in "handled". + &&LABEL_ROSE_INSTR_CHECK_SINGLE_LOOKAROUND, //!< Single lookaround check. + &&LABEL_ROSE_INSTR_CHECK_LOOKAROUND, //!< Lookaround check. + &&LABEL_ROSE_INSTR_CHECK_MASK, //!< 8-bytes mask check. 
+ &&LABEL_ROSE_INSTR_CHECK_MASK_32, //!< 32-bytes and/cmp/neg mask check. + &&LABEL_ROSE_INSTR_CHECK_BYTE, //!< Single Byte check. + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_16x8, //!< Check 16-byte data by 8-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_32x8, //!< Check 32-byte data by 8-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_16x16, //!< Check 16-byte data by 16-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_32x16, //!< Check 32-byte data by 16-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_INFIX, //!< Infix engine must be in accept state. + &&LABEL_ROSE_INSTR_CHECK_PREFIX, //!< Prefix engine must be in accept state. + &&LABEL_ROSE_INSTR_PUSH_DELAYED, //!< Push delayed literal matches. + &&LABEL_ROSE_INSTR_DUMMY_NOP, //!< NOP. Should not exist in build programs. + &&LABEL_ROSE_INSTR_CATCH_UP, //!< Catch up engines, anchored matches. + &&LABEL_ROSE_INSTR_CATCH_UP_MPV, //!< Catch up the MPV. + &&LABEL_ROSE_INSTR_SOM_ADJUST, //!< Set SOM from a distance to EOM. + &&LABEL_ROSE_INSTR_SOM_LEFTFIX, //!< Acquire SOM from a leftfix engine. + &&LABEL_ROSE_INSTR_SOM_FROM_REPORT, //!< Acquire SOM from a som_operation. + &&LABEL_ROSE_INSTR_SOM_ZERO, //!< Set SOM to zero. + &&LABEL_ROSE_INSTR_TRIGGER_INFIX, //!< Trigger an infix engine. + &&LABEL_ROSE_INSTR_TRIGGER_SUFFIX, //!< Trigger a suffix engine. + &&LABEL_ROSE_INSTR_DEDUPE, //!< Run deduplication for report. + &&LABEL_ROSE_INSTR_DEDUPE_SOM, //!< Run deduplication for SOM report. + &&LABEL_ROSE_INSTR_REPORT_CHAIN, //!< Fire a chained report (MPV). + &&LABEL_ROSE_INSTR_REPORT_SOM_INT, //!< Manipulate SOM only. + &&LABEL_ROSE_INSTR_REPORT_SOM_AWARE, //!< Manipulate SOM from SOM-aware source. + &&LABEL_ROSE_INSTR_REPORT, + &&LABEL_ROSE_INSTR_REPORT_EXHAUST, + &&LABEL_ROSE_INSTR_REPORT_SOM, + &&LABEL_ROSE_INSTR_REPORT_SOM_EXHAUST, + &&LABEL_ROSE_INSTR_DEDUPE_AND_REPORT, + &&LABEL_ROSE_INSTR_FINAL_REPORT, + &&LABEL_ROSE_INSTR_CHECK_EXHAUSTED, //!< Check if an ekey has already been set. + &&LABEL_ROSE_INSTR_CHECK_MIN_LENGTH, //!< Check (EOM - SOM) against min length. + &&LABEL_ROSE_INSTR_SET_STATE, //!< Switch a state index on. + &&LABEL_ROSE_INSTR_SET_GROUPS, //!< Set some literal group bits. + &&LABEL_ROSE_INSTR_SQUASH_GROUPS, //!< Conditionally turn off some groups. + &&LABEL_ROSE_INSTR_CHECK_STATE, //!< Test a single bit in the state multibit. + &&LABEL_ROSE_INSTR_SPARSE_ITER_BEGIN, //!< Begin running a sparse iter over states. + &&LABEL_ROSE_INSTR_SPARSE_ITER_NEXT, //!< Continue running sparse iter over states. + &&LABEL_ROSE_INSTR_SPARSE_ITER_ANY, //!< Test for any bit in the sparse iterator. + &&LABEL_ROSE_INSTR_ENGINES_EOD, + &&LABEL_ROSE_INSTR_SUFFIXES_EOD, + &&LABEL_ROSE_INSTR_MATCHER_EOD, + &&LABEL_ROSE_INSTR_CHECK_LONG_LIT, + &&LABEL_ROSE_INSTR_CHECK_LONG_LIT_NOCASE, + &&LABEL_ROSE_INSTR_CHECK_MED_LIT, + &&LABEL_ROSE_INSTR_CHECK_MED_LIT_NOCASE, + &&LABEL_ROSE_INSTR_CLEAR_WORK_DONE, + &&LABEL_ROSE_INSTR_MULTIPATH_LOOKAROUND, + &&LABEL_ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_16x8, + &&LABEL_ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x8, + &&LABEL_ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x16, + &&LABEL_ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_64, + &&LABEL_ROSE_INSTR_INCLUDED_JUMP, + &&LABEL_ROSE_INSTR_SET_LOGICAL, + &&LABEL_ROSE_INSTR_SET_COMBINATION, + &&LABEL_ROSE_INSTR_FLUSH_COMBINATION, + &&LABEL_ROSE_INSTR_SET_EXHAUST, + &&LABEL_ROSE_INSTR_LAST_FLUSH_COMBINATION +#ifdef HAVE_AVX512 + , + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_64x8, //!< Check 64-byte data by 8-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_64x16, //!< Check 64-byte data by 16-bucket shufti. 
+ &&LABEL_ROSE_INSTR_CHECK_MASK_64 //!< 64-bytes and/cmp/neg mask check. +#endif + }; +#endif + + for (;;) { + assert(ISALIGNED_N(pc, ROSE_INSTR_MIN_ALIGN)); + assert(pc >= pc_base); + assert((size_t)(pc - pc_base) < t->size); + const u8 code = *(const u8 *)pc; + assert(code <= LAST_ROSE_INSTRUCTION); + + switch ((enum RoseInstructionCode)code) { + PROGRAM_CASE(END) { + DEBUG_PRINTF("finished\n"); + return HWLM_CONTINUE_MATCHING; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(ANCHORED_DELAY) { + if (in_anchored && end > t->floatingMinLiteralMatchOffset) { + DEBUG_PRINTF("delay until playback\n"); + tctxt->groups |= ri->groups; + work_done = 1; + recordAnchoredLiteralMatch(t, scratch, ri->anch_id, end); + + assert(ri->done_jump); // must progress + pc += ri->done_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LIT_EARLY) { + if (end < ri->min_offset) { + DEBUG_PRINTF("halt: before min_offset=%u\n", + ri->min_offset); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_GROUPS) { + DEBUG_PRINTF("groups=0x%llx, checking instr groups=0x%llx\n", + tctxt->groups, ri->groups); + if (!(ri->groups & tctxt->groups)) { + DEBUG_PRINTF("halt: no groups are set\n"); + return HWLM_CONTINUE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_ONLY_EOD) { + struct core_info *ci = &scratch->core_info; + if (end != ci->buf_offset + ci->len) { + DEBUG_PRINTF("should only match at end of data\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_BOUNDS) { + if (!roseCheckBounds(end, ri->min_bound, ri->max_bound)) { + DEBUG_PRINTF("failed bounds check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_NOT_HANDLED) { + struct fatbit *handled = scratch->handled_roles; + if (fatbit_set(handled, t->handledKeyCount, ri->key)) { + DEBUG_PRINTF("key %u already set\n", ri->key); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SINGLE_LOOKAROUND) { + if (!roseCheckSingleLookaround(t, scratch, ri->offset, + ri->reach_index, end)) { + DEBUG_PRINTF("failed lookaround check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LOOKAROUND) { + if (!roseCheckLookaround(t, scratch, ri->look_index, + ri->reach_index, ri->count, end)) { + DEBUG_PRINTF("failed lookaround check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MASK) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + DEBUG_PRINTF("failed mask check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MASK_32) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask32(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + 
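+            // Added note: the CHECK_* cases below share one failure pattern:
+            // they add ri->fail_jump (a byte offset relative to the current
+            // instruction) to pc instead of falling through.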
PROGRAM_CASE(CHECK_BYTE) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckByte(ci, ri->and_mask, ri->cmp_mask, + ri->negation, ri->offset, end)) { + DEBUG_PRINTF("failed byte check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_16x8) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti16x8(ci, ri->nib_mask, + ri->bucket_select_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri-> fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_32x8) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti32x8(ci, ri->hi_mask, ri->lo_mask, + ri->bucket_select_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri-> fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_16x16) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti16x16(ci, ri->hi_mask, ri->lo_mask, + ri->bucket_select_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri-> fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_32x16) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti32x16(ci, ri->hi_mask, ri->lo_mask, + ri->bucket_select_mask_hi, + ri->bucket_select_mask_lo, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri-> fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + +#ifdef HAVE_AVX512 + PROGRAM_CASE(CHECK_MASK_64) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask64(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_64x8) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti64x8(ci, ri->hi_mask, ri->lo_mask, + ri->bucket_select_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_64x16) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti64x16(ci, ri->hi_mask_1, ri->hi_mask_2, + ri->lo_mask_1, ri->lo_mask_2, + ri->bucket_select_mask_hi, + ri->bucket_select_mask_lo, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP; + } + } + PROGRAM_NEXT_INSTRUCTION +#endif + + PROGRAM_CASE(CHECK_INFIX) { + if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report, + end)) { + DEBUG_PRINTF("failed infix check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_PREFIX) { + if (!roseTestPrefix(t, scratch, ri->queue, ri->lag, ri->report, + end)) { + DEBUG_PRINTF("failed prefix check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(PUSH_DELAYED) { + rosePushDelayedMatch(t, scratch, ri->delay, ri->index, end); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DUMMY_NOP) { + assert(0); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CATCH_UP) { + if (roseCatchUpTo(t, scratch, end) == HWLM_TERMINATE_MATCHING) { 
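+                    // Added note: catch-up was told to stop (user callback or
+                    // error), so propagate the termination to the caller.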
+ return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CATCH_UP_MPV) { + if (from_mpv || skip_mpv_catchup) { + DEBUG_PRINTF("skipping mpv catchup\n"); + } else if (roseCatchUpMPV(t, + end - scratch->core_info.buf_offset, + scratch) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_ADJUST) { + assert(ri->distance <= end); + som = end - ri->distance; + DEBUG_PRINTF("som is (end - %u) = %llu\n", ri->distance, som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_LEFTFIX) { + som = roseGetHaigSom(t, scratch, ri->queue, ri->lag); + DEBUG_PRINTF("som from leftfix is %llu\n", som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_FROM_REPORT) { + som = handleSomExternal(scratch, &ri->som, end); + DEBUG_PRINTF("som from report %u is %llu\n", ri->som.onmatch, + som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_ZERO) { + DEBUG_PRINTF("setting SOM to zero\n"); + som = 0; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(TRIGGER_INFIX) { + roseTriggerInfix(t, scratch, som, end, ri->queue, ri->event, + ri->cancel); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(TRIGGER_SUFFIX) { + if (roseTriggerSuffix(t, scratch, ri->queue, ri->event, som, + end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE) { + updateSeqPoint(tctxt, end, from_mpv); + const char do_som = t->hasSom; // TODO: constant propagate + const char is_external_report = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + case DEDUPE_CONTINUE: + break; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE_SOM) { + updateSeqPoint(tctxt, end, from_mpv); + const char is_external_report = 0; + const char do_som = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + case DEDUPE_CONTINUE: + break; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_CHAIN) { + // Note: sequence points updated inside this function. 
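+                // Added note: a chained report does not reach the user
+                // callback directly; it feeds an event to the MPV engine,
+                // which is caught up as part of handling the match.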
+ if (roseCatchUpAndHandleChainMatch( + t, scratch, ri->event, ri->top_squash_distance, end, + in_catchup) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_INT) { + updateSeqPoint(tctxt, end, from_mpv); + roseHandleSom(scratch, &ri->som, end); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_AWARE) { + updateSeqPoint(tctxt, end, from_mpv); + roseHandleSomSom(scratch, &ri->som, som, end); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + ri->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReportSom(t, scratch, som, end, ri->onmatch, + ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReportSom(t, scratch, som, end, ri->onmatch, + ri->offset_adjust, + ri->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE_AND_REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + const char do_som = t->hasSom; // TODO: constant propagate + const char is_external_report = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + case DEDUPE_CONTINUE: + break; + } + + const u32 ekey = INVALID_EKEY; + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(FINAL_REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + /* One-shot specialisation: this instruction always terminates + * execution of the program. 
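+                 * Returning here behaves exactly as if a trailing END
+                 * instruction had been executed.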
*/ + return HWLM_CONTINUE_MATCHING; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_EXHAUSTED) { + DEBUG_PRINTF("check ekey %u\n", ri->ekey); + assert(ri->ekey != INVALID_EKEY); + assert(ri->ekey < t->ekeyCount); + const char *evec = scratch->core_info.exhaustionVector; + if (isExhausted(t, evec, ri->ekey)) { + DEBUG_PRINTF("ekey %u already set, match is exhausted\n", + ri->ekey); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MIN_LENGTH) { + DEBUG_PRINTF("check min length %llu (adj %d)\n", ri->min_length, + ri->end_adj); + assert(ri->min_length > 0); + assert(ri->end_adj == 0 || ri->end_adj == -1); + assert(som == HS_OFFSET_PAST_HORIZON || som <= end); + if (som != HS_OFFSET_PAST_HORIZON && + ((end + ri->end_adj) - som < ri->min_length)) { + DEBUG_PRINTF("failed check, match len %llu\n", + (u64a)((end + ri->end_adj) - som)); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_STATE) { + DEBUG_PRINTF("set state index %u\n", ri->index); + mmbit_set(getRoleState(scratch->core_info.state), + t->rolesWithStateCount, ri->index); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_GROUPS) { + tctxt->groups |= ri->groups; + DEBUG_PRINTF("set groups 0x%llx -> 0x%llx\n", ri->groups, + tctxt->groups); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SQUASH_GROUPS) { + assert(popcount64(ri->groups) == 63); // Squash only one group. + if (work_done) { + tctxt->groups &= ri->groups; + DEBUG_PRINTF("squash groups 0x%llx -> 0x%llx\n", ri->groups, + tctxt->groups); + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_STATE) { + DEBUG_PRINTF("check state %u\n", ri->index); + const u8 *roles = getRoleState(scratch->core_info.state); + if (!mmbit_isset(roles, t->rolesWithStateCount, ri->index)) { + DEBUG_PRINTF("state not on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SPARSE_ITER_BEGIN) { + DEBUG_PRINTF("iter_offset=%u\n", ri->iter_offset); + const struct mmbit_sparse_iter *it = + getByOffset(t, ri->iter_offset); + assert(ISALIGNED(it)); + + const u8 *roles = getRoleState(scratch->core_info.state); + + u32 idx = 0; + u32 i = mmbit_sparse_iter_begin(roles, t->rolesWithStateCount, + &idx, it, si_state); + if (i == MMB_INVALID) { + DEBUG_PRINTF("no states in sparse iter are on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + + fatbit_clear(scratch->handled_roles); + + const u32 *jumps = getByOffset(t, ri->jump_table); + DEBUG_PRINTF("state %u (idx=%u) is on, jump to %u\n", i, idx, + jumps[idx]); + pc = pc_base + jumps[idx]; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SPARSE_ITER_NEXT) { + DEBUG_PRINTF("iter_offset=%u, state=%u\n", ri->iter_offset, + ri->state); + const struct mmbit_sparse_iter *it = + getByOffset(t, ri->iter_offset); + assert(ISALIGNED(it)); + + const u8 *roles = getRoleState(scratch->core_info.state); + + u32 idx = 0; + u32 i = mmbit_sparse_iter_next(roles, t->rolesWithStateCount, + ri->state, &idx, it, si_state); + if (i == MMB_INVALID) { + DEBUG_PRINTF("no more states in sparse iter are on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + + const u32 *jumps = getByOffset(t, ri->jump_table); + 
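+                // Added note: jump_table holds one program offset (relative
+                // to pc_base) per element of the sparse iterator; continue at
+                // the handler for the state found to be on.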
DEBUG_PRINTF("state %u (idx=%u) is on, jump to %u\n", i, idx, + jumps[idx]); + pc = pc_base + jumps[idx]; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SPARSE_ITER_ANY) { + DEBUG_PRINTF("iter_offset=%u\n", ri->iter_offset); + const struct mmbit_sparse_iter *it = + getByOffset(t, ri->iter_offset); + assert(ISALIGNED(it)); + + const u8 *roles = getRoleState(scratch->core_info.state); + + u32 idx = 0; + u32 i = mmbit_sparse_iter_begin(roles, t->rolesWithStateCount, + &idx, it, si_state); + if (i == MMB_INVALID) { + DEBUG_PRINTF("no states in sparse iter are on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + DEBUG_PRINTF("state %u (idx=%u) is on\n", i, idx); + fatbit_clear(scratch->handled_roles); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(ENGINES_EOD) { + if (roseEnginesEod(t, scratch, end, ri->iter_offset) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SUFFIXES_EOD) { + if (roseSuffixesEod(t, scratch, end) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(MATCHER_EOD) { + if (roseMatcherEod(t, scratch, end) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LONG_LIT) { + const char nocase = 0; + if (!roseCheckLongLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LONG_LIT_NOCASE) { + const char nocase = 1; + if (!roseCheckLongLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed nocase long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MED_LIT) { + const char nocase = 0; + if (!roseCheckMediumLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MED_LIT_NOCASE) { + const char nocase = 1; + if (!roseCheckMediumLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CLEAR_WORK_DONE) { + DEBUG_PRINTF("clear work_done flag\n"); + work_done = 0; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(MULTIPATH_LOOKAROUND) { + if (!roseMultipathLookaround(t, scratch, ri->look_index, + ri->reach_index, ri->count, + ri->last_start, ri->start_mask, + end)) { + DEBUG_PRINTF("failed multi-path lookaround check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_16x8) { + if (!roseCheckMultipathShufti16x8(scratch, ri, end)) { + DEBUG_PRINTF("failed multi-path shufti 16x8 check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_32x8) { + if (!roseCheckMultipathShufti32x8(scratch, 
ri, end)) { + DEBUG_PRINTF("failed multi-path shufti 32x8 check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_32x16) { + if (!roseCheckMultipathShufti32x16(scratch, ri, end)) { + DEBUG_PRINTF("failed multi-path shufti 32x16 check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_64) { + if (!roseCheckMultipathShufti64(scratch, ri, end)) { + DEBUG_PRINTF("failed multi-path shufti 64 check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(INCLUDED_JUMP) { + if (scratch->fdr_conf) { + // squash the bucket of included literal + u8 shift = scratch->fdr_conf_offset & ~7U; + u64a mask = ((~(u64a)ri->squash) << shift); + *(scratch->fdr_conf) &= mask; + + pc = getByOffset(t, ri->child_offset); + pc_base = pc; + programOffset = (const u8 *)pc_base -(const u8 *)t; + DEBUG_PRINTF("pc_base %p pc %p child_offset %u squash %u\n", + pc_base, pc, ri->child_offset, ri->squash); + work_done = 0; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_LOGICAL) { + DEBUG_PRINTF("set logical value of lkey %u, offset_adjust=%d\n", + ri->lkey, ri->offset_adjust); + assert(ri->lkey != INVALID_LKEY); + assert(ri->lkey < t->lkeyCount); + char *lvec = scratch->core_info.logicalVector; + setLogicalVal(t, lvec, ri->lkey, 1); + updateLastCombMatchOffset(tctxt, end + ri->offset_adjust); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_COMBINATION) { + DEBUG_PRINTF("set ckey %u as active\n", ri->ckey); + assert(ri->ckey != INVALID_CKEY); + assert(ri->ckey < t->ckeyCount); + char *cvec = scratch->core_info.combVector; + setCombinationActive(t, cvec, ri->ckey); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(FLUSH_COMBINATION) { + assert(end >= tctxt->lastCombMatchOffset); + if (end > tctxt->lastCombMatchOffset) { + if (flushActiveCombinations(t, scratch) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseSetExhaust(t, scratch, ri->ekey) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(LAST_FLUSH_COMBINATION) { + assert(end >= tctxt->lastCombMatchOffset); + if (flushActiveCombinations(t, scratch) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + if (checkPurelyNegatives(t, scratch, end) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + default: { + assert(0); // unreachable + scratch->core_info.status |= STATUS_ERROR; + return HWLM_TERMINATE_MATCHING; + } + } + } + + assert(0); // unreachable + return HWLM_CONTINUE_MATCHING; +} + +#define L_PROGRAM_CASE(name) \ + case ROSE_INSTR_##name: { \ + DEBUG_PRINTF("l_instruction: " #name " (pc=%u)\n", \ + programOffset + (u32)(pc - pc_base)); \ + const struct ROSE_STRUCT_##name *ri = \ + (const struct ROSE_STRUCT_##name *)pc; + +#define L_PROGRAM_NEXT_INSTRUCTION \ + pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN); \ + break; \ + } + +#define L_PROGRAM_NEXT_INSTRUCTION_JUMP continue; + +hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 programOffset, + u64a 
som, u64a end, u8 prog_flags) { + DEBUG_PRINTF("program=%u, offsets [%llu,%llu], flags=%u\n", programOffset, + som, end, prog_flags); + + assert(programOffset != ROSE_INVALID_PROG_OFFSET); + assert(programOffset >= sizeof(struct RoseEngine)); + assert(programOffset < t->size); + + const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP; + const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV; + + const char *pc_base = getByOffset(t, programOffset); + const char *pc = pc_base; + + // If this program has an effect, work_done will be set to one (which may + // allow the program to squash groups). + int work_done = 0; + + struct RoseContext *tctxt = &scratch->tctxt; + + assert(*(const u8 *)pc != ROSE_INSTR_END); + + for (;;) { + assert(ISALIGNED_N(pc, ROSE_INSTR_MIN_ALIGN)); + assert(pc >= pc_base); + assert((size_t)(pc - pc_base) < t->size); + const u8 code = *(const u8 *)pc; + assert(code <= LAST_ROSE_INSTRUCTION); + + switch ((enum RoseInstructionCode)code) { + L_PROGRAM_CASE(END) { + DEBUG_PRINTF("finished\n"); + return HWLM_CONTINUE_MATCHING; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_GROUPS) { + DEBUG_PRINTF("groups=0x%llx, checking instr groups=0x%llx\n", + tctxt->groups, ri->groups); + if (!(ri->groups & tctxt->groups)) { + DEBUG_PRINTF("halt: no groups are set\n"); + return HWLM_CONTINUE_MATCHING; + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_MASK) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + DEBUG_PRINTF("failed mask check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_MASK_32) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask32(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + +#ifdef HAVE_AVX512 + L_PROGRAM_CASE(CHECK_MASK_64) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask64(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION +#endif + + L_PROGRAM_CASE(CHECK_BYTE) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckByte(ci, ri->and_mask, ri->cmp_mask, + ri->negation, ri->offset, end)) { + DEBUG_PRINTF("failed byte check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(PUSH_DELAYED) { + rosePushDelayedMatch(t, scratch, ri->delay, ri->index, end); + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CATCH_UP) { + if (roseCatchUpTo(t, scratch, end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(SOM_FROM_REPORT) { + som = handleSomExternal(scratch, &ri->som, end); + DEBUG_PRINTF("som from report %u is %llu\n", ri->som.onmatch, + som); + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(DEDUPE) { + updateSeqPoint(tctxt, end, from_mpv); + const char do_som = t->hasSom; // TODO: constant propagate + const char is_external_report = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: 
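+                /* Catch up performed inside dedupeCatchup() delivered
+                 * reports and matching was terminated (e.g. a user callback
+                 * returned non-zero), so abandon the whole scan, not just
+                 * this program. */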
+ return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + case DEDUPE_CONTINUE: + break; + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(DEDUPE_SOM) { + updateSeqPoint(tctxt, end, from_mpv); + const char is_external_report = 0; + const char do_som = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + case DEDUPE_CONTINUE: + break; + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(REPORT_CHAIN) { + // Note: sequence points updated inside this function. + if (roseCatchUpAndHandleChainMatch( + t, scratch, ri->event, ri->top_squash_distance, end, + in_catchup) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(REPORT_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + ri->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(REPORT_SOM) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReportSom(t, scratch, som, end, ri->onmatch, + ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(DEDUPE_AND_REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + const char do_som = t->hasSom; // TODO: constant propagate + const char is_external_report = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + case DEDUPE_CONTINUE: + break; + } + + const u32 ekey = INVALID_EKEY; + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(FINAL_REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + /* One-shot specialisation: this instruction always terminates + * execution of the program. 
*/ + return HWLM_CONTINUE_MATCHING; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_EXHAUSTED) { + DEBUG_PRINTF("check ekey %u\n", ri->ekey); + assert(ri->ekey != INVALID_EKEY); + assert(ri->ekey < t->ekeyCount); + const char *evec = scratch->core_info.exhaustionVector; + if (isExhausted(t, evec, ri->ekey)) { + DEBUG_PRINTF("ekey %u already set, match is exhausted\n", + ri->ekey); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(SQUASH_GROUPS) { + assert(popcount64(ri->groups) == 63); // Squash only one group. + if (work_done) { + tctxt->groups &= ri->groups; + DEBUG_PRINTF("squash groups 0x%llx -> 0x%llx\n", ri->groups, + tctxt->groups); + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_LONG_LIT) { + const char nocase = 0; + if (!roseCheckLongLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_LONG_LIT_NOCASE) { + const char nocase = 1; + if (!roseCheckLongLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed nocase long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_MED_LIT) { + const char nocase = 0; + if (!roseCheckMediumLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_MED_LIT_NOCASE) { + const char nocase = 1; + if (!roseCheckMediumLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CLEAR_WORK_DONE) { + DEBUG_PRINTF("clear work_done flag\n"); + work_done = 0; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(INCLUDED_JUMP) { + if (scratch->fdr_conf) { + // squash the bucket of included literal + u8 shift = scratch->fdr_conf_offset & ~7U; + u64a mask = ((~(u64a)ri->squash) << shift); + *(scratch->fdr_conf) &= mask; + + pc = getByOffset(t, ri->child_offset); + pc_base = pc; + programOffset = (const u8 *)pc_base -(const u8 *)t; + DEBUG_PRINTF("pc_base %p pc %p child_offset %u squash %u\n", + pc_base, pc, ri->child_offset, ri->squash); + work_done = 0; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(SET_LOGICAL) { + DEBUG_PRINTF("set logical value of lkey %u, offset_adjust=%d\n", + ri->lkey, ri->offset_adjust); + assert(ri->lkey != INVALID_LKEY); + assert(ri->lkey < t->lkeyCount); + char *lvec = scratch->core_info.logicalVector; + setLogicalVal(t, lvec, ri->lkey, 1); + updateLastCombMatchOffset(tctxt, end + ri->offset_adjust); + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(SET_COMBINATION) { + DEBUG_PRINTF("set ckey %u as active\n", ri->ckey); + assert(ri->ckey != INVALID_CKEY); + assert(ri->ckey < t->ckeyCount); + char *cvec = scratch->core_info.combVector; + setCombinationActive(t, cvec, ri->ckey); + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(FLUSH_COMBINATION) { + assert(end >= 
tctxt->lastCombMatchOffset); + if (end > tctxt->lastCombMatchOffset) { + if (flushActiveCombinations(t, scratch) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(SET_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseSetExhaust(t, scratch, ri->ekey) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(LAST_FLUSH_COMBINATION) { + assert(end >= tctxt->lastCombMatchOffset); + if (flushActiveCombinations(t, scratch) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + if (checkPurelyNegatives(t, scratch, end) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + L_PROGRAM_NEXT_INSTRUCTION + + default: { + assert(0); // unreachable + scratch->core_info.status |= STATUS_ERROR; + return HWLM_TERMINATE_MATCHING; + } + } + } + + assert(0); // unreachable + return HWLM_CONTINUE_MATCHING; +} + +#undef L_PROGRAM_CASE +#undef L_PROGRAM_NEXT_INSTRUCTION +#undef L_PROGRAM_NEXT_INSTRUCTION_JUMP + +#undef PROGRAM_CASE +#undef PROGRAM_NEXT_INSTRUCTION +#undef PROGRAM_NEXT_INSTRUCTION_JUMP diff --git a/regex/rose/program_runtime.h b/regex/rose/program_runtime.h new file mode 100644 index 000000000..50bf202c6 --- /dev/null +++ b/regex/rose/program_runtime.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Rose runtime: program interpreter. + */ + +#ifndef PROGRAM_RUNTIME_H +#define PROGRAM_RUNTIME_H + +#include "hwlm/hwlm.h" // for hwlmcb_rv_t +#include "rose.h" +#include "scratch.h" +#include "ue2common.h" + +/* + * Program context flags, which control the behaviour of some instructions at + * based on runtime contexts (whether the program is triggered by the anchored + * matcher, engine catchup, etc). 
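+ *
+ * Illustrative sketch only (not an additional interface): callers OR the
+ * flags below together and the interpreter tests individual bits, e.g.
+ *
+ *     u8 prog_flags = ROSE_PROG_FLAG_IN_CATCHUP | ROSE_PROG_FLAG_FROM_MPV;
+ *     const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV;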
+ */ + +#define ROSE_PROG_FLAG_IN_ANCHORED 1 +#define ROSE_PROG_FLAG_IN_CATCHUP 2 +#define ROSE_PROG_FLAG_FROM_MPV 4 +#define ROSE_PROG_FLAG_SKIP_MPV_CATCHUP 8 + +hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 programOffset, + u64a som, u64a end, u8 prog_flags); + +hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 programOffset, + u64a som, u64a end, u8 prog_flags); + +#endif // PROGRAM_RUNTIME_H diff --git a/regex/rose/rose.h b/regex/rose/rose.h new file mode 100644 index 000000000..409b70028 --- /dev/null +++ b/regex/rose/rose.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ROSE_H +#define ROSE_H + +#include "ue2common.h" + +struct RoseEngine; +struct hs_scratch; + +// Initialise state space for engine use. 
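+//
+// Rough usage sketch (the real caller and allocation path differ): the state
+// buffer is RoseEngine::stateOffsets.end bytes long and is initialised once
+// before the first scan, e.g.
+//
+//     char *state = malloc(t->stateOffsets.end);  /* hypothetical caller */
+//     roseInitState(t, state);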
+void roseInitState(const struct RoseEngine *t, char *state); + +/* assumes core_info in scratch has been init to point to data */ +void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch); + +/* assumes core_info in scratch has been init to point to data */ +void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch); + +void roseStreamEodExec(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch); + +hwlmcb_rv_t roseCallback(size_t end, u32 id, struct hs_scratch *scratch); + +int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context); + +int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, + u64a stream_offset, struct hs_scratch *scratch); + +int roseRunFlushCombProgram(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a end); + +int roseRunLastFlushCombProgram(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a end); + +#endif // ROSE_H diff --git a/regex/rose/rose_common.h b/regex/rose/rose_common.h new file mode 100644 index 000000000..34678b8fc --- /dev/null +++ b/regex/rose/rose_common.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ROSE_COMMON_H +#define ROSE_COMMON_H + +// Common defs available to build-time clients as well as runtime. + +#define ROSE_BOUND_INF (~0U) +#define MAX_MASK2_WIDTH 32 + +// Max block width to use the combined small-block matcher on, instead of +// running the floating and anchored tables. +#define ROSE_SMALL_BLOCK_LEN 32 + +/** \brief Length in bytes of a reach bitvector, used by the lookaround code. */ +#define REACH_BITVECTOR_LEN 32 + +/** \brief Length in bytes of a reach bitvector for multi-path lookaround. */ +#define MULTI_REACH_BITVECTOR_LEN 256 + +/** + * \brief The max offset from the leftmost byte to the rightmost byte in + * multi-path lookaround. + */ +#define MULTIPATH_MAX_LEN 16 + +/** \brief Value used to represent an invalid Rose program offset. 
*/ +#define ROSE_INVALID_PROG_OFFSET 0 + +#endif // ROSE_COMMON_H diff --git a/regex/rose/rose_internal.h b/regex/rose/rose_internal.h new file mode 100644 index 000000000..f84c7a080 --- /dev/null +++ b/regex/rose/rose_internal.h @@ -0,0 +1,659 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Rose data structures. + */ + +#ifndef ROSE_INTERNAL_H +#define ROSE_INTERNAL_H + +#include "ue2common.h" +#include "rose_common.h" +#include "util/scatter.h" + +#define ROSE_OFFSET_INVALID 0xffffffff + +// Group constants +typedef u64a rose_group; + +// Delayed literal stuff +#define DELAY_BITS 5 +#define DELAY_SLOT_COUNT (1U << DELAY_BITS) +#define MAX_DELAY (DELAY_SLOT_COUNT - 1) +#define DELAY_MASK (DELAY_SLOT_COUNT - 1) + +/* Allocation of Rose literal ids + * + * The rose literal id space is segmented: + * + * ---- 0 + * | | 'Normal' undelayed literals in either e or f tables + * | | + * | | + * | | + * ---- anchored_base_id + * | | literals from the a table + * | | + * ---- delay_base_id + * | | Delayed version of normal literals + * | | + * ---- literalCount + */ + +/* Rose Literal Sources + * + * Rose currently gets events (mainly roseProcessMatch calls) from a number of + * sources: + * 1) The floating table + * 2) The anchored table + * 3) Delayed literals + * 4) Suffix NFAs + * 5) Literal masks + * 5) End anchored table + * 6) Prefix / Infix nfas + * + * Care is required to ensure that events appear to come into Rose in order + * (or sufficiently ordered for Rose to cope). Generally the progress of the + * floating table is considered the canonical position in the buffer. + * + * Anchored table: + * The anchored table is run before the floating table as nothing in it can + * depend on a floating literal. Order is achieved by two measures: + * a) user matches^1 are logged and held until the floating matcher passes that + * point; + * b) any floating role with an anchored predecessor has a history relationship + * to enforce the ordering. 
+ * + * Delayed literals: + * Delayed literal ordering is handled by delivering any pending delayed + * literals before processing any floating match. + * + * Suffix: + * Suffixes are always pure terminal roles. Prior to raising a match^2, pending + * NFA queues are run to the current point (floating or delayed literal) as + * appropriate. + * + * Literal Masks: + * These are triggered from either floating literals or delayed literals and + * inspect the data behind them. Matches are raised at the same location as the + * trigger literal so there are no ordering issues. Masks are always pure + * terminal roles. + * + * Lookaround: + * These are tests run on receipt of a role that "look around" the match, + * checking characters at nearby offsets against reachability masks. Each role + * can have a list of these lookaround offset/reach pairs, ordered in offset + * order, and any failure will prevent the role from being switched on. Offsets + * are relative to the byte after a literal match, and can be negative. + * + * Prefix / Infix: + * TODO: remember / discuss + * + * End anchored table: + * All user matches occur at the last byte. We do this last, so no problems + * (yippee) + * + * ^1 User matches which occur before any possible match from the other tables + * are not delayed. + * ^2 Queues may also be run to the current location if a queue is full and + * needs to be emptied. + * ^3 There is no need to catch up at the end of a block scan as it contains no + * terminals. + */ + +struct RoseCountingMiracle { + char shufti; /** 1: count shufti class; 0: count a single character */ + u8 count; /** minimum number of occurrences for the counting + * miracle char to kill the leftfix. */ + u8 c; /** character to look for if not shufti */ + u8 poison; /** character not in the shufti mask */ + m128 lo; /** shufti lo mask */ + m128 hi; /** shufti hi mask */ +}; + +struct LeftNfaInfo { + u32 maxQueueLen; + u32 maxLag; // maximum of successor roles' lag + u32 lagIndex; // iff lag != 0, index into leftfixLagTable + u32 stopTable; // stop table index, or ROSE_OFFSET_INVALID + u8 transient; /**< 0 if not transient, else max width of transient prefix */ + char infix; /* TODO: make flags */ + char eager; /**< nfa should be run eagerly to first match or death */ + char eod_check; /**< nfa is used by the event eod literal */ + u32 countingMiracleOffset; /** if not 0, offset to RoseCountingMiracle. */ + rose_group squash_mask; /* & mask applied when rose nfa dies */ +}; + +struct NfaInfo { + u32 nfaOffset; + u32 stateOffset; + u32 fullStateOffset; /* offset in scratch, relative to ??? */ + u32 ekeyListOffset; /* suffix, relative to base of rose, 0 if no ekeys */ + u8 no_retrigger; /* TODO */ + u8 in_sbmatcher; /**< this outfix should not be run in small-block + * execution, as it will be handled by the sbmatcher + * HWLM table. */ + u8 eod; /* suffix is triggered by the etable --> can only produce eod + * matches */ +}; + +#define MAX_STORED_LEFTFIX_LAG 127 /* max leftfix lag that we can store in one + * whole byte (OWB) (streaming only). Other + * values in OWB are reserved for zombie + * status */ +#define OWB_ZOMBIE_ALWAYS_YES 128 /* nfa will always answer yes to any rose + * prefix checks */ + +/* offset of the status flags in the stream state. */ +#define ROSE_STATE_OFFSET_STATUS_FLAGS 0 + +/* offset of role mmbit in stream state (just after the status flag byte). */ +#define ROSE_STATE_OFFSET_ROLE_MMBIT sizeof(u8) + +/** + * \brief Rose state offsets. 
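+ *
+ * Illustrative use: each offset below is added to the base of the stream
+ * state to find its region, e.g. the history buffer starts at
+ * stream_state + stateOffsets.history.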
+ * + * Stores pre-calculated offsets (in bytes) to MOST of the state structures + * used by Rose, relative to the start of stream state. + * + * State not covered by this structure includes: + * + * -# the first byte, containing the status bitmask + * -# the role state multibit + */ +struct RoseStateOffsets { + /** History buffer. + * + * Max size of history is RoseEngine::historyRequired. */ + u32 history; + + /** Exhausted multibit. + * + * entry per exhaustible key (used by Highlander mode). If a bit is set, + * reports with that ekey should not be delivered to the user. */ + u32 exhausted; + + /** size in bytes of exhausted multibit */ + u32 exhausted_size; + + /** Logical multibit. + * + * entry per logical key(operand/operator) (used by Logical Combination). */ + u32 logicalVec; + + /** size in bytes of logical multibit */ + u32 logicalVec_size; + + /** Combination multibit. + * + * entry per combination key (used by Logical Combination). */ + u32 combVec; + + /** size in bytes of combination multibit */ + u32 combVec_size; + + /** Multibit for active suffix/outfix engines. */ + u32 activeLeafArray; + + /** Size of multibit for active suffix/outfix engines in bytes. */ + u32 activeLeafArray_size; + + /** Multibit for active leftfix (prefix/infix) engines. */ + u32 activeLeftArray; + + /** Size of multibit for active leftfix (prefix/infix) engines in bytes. */ + u32 activeLeftArray_size; + + /** Table of lag information (stored as one byte per engine) for active + * Rose leftfix engines. */ + u32 leftfixLagTable; + + /** State for anchored matchers (McClellan DFAs). */ + u32 anchorState; + + /** Packed Rose groups value. */ + u32 groups; + + /** Size of packed Rose groups value, in bytes. */ + u32 groups_size; + + /** State for long literal support. */ + u32 longLitState; + + /** Size of the long literal state. */ + u32 longLitState_size; + + /** Packed SOM location slots. */ + u32 somLocation; + + /** Multibit guarding SOM location slots. */ + u32 somValid; + + /** Multibit guarding SOM location slots. */ + u32 somWritable; + + /** Size of each of the somValid and somWritable multibits, in bytes. */ + u32 somMultibit_size; + + /** Begin of the region where NFA engine state is stored. + * The NFA state region extends to end. */ + u32 nfaStateBegin; + + /** Total size of Rose state, in bytes. */ + u32 end; +}; + +struct RoseBoundaryReports { + /** \brief 0 if no reports list, otherwise offset of program to run to + * deliver reports at EOD. */ + u32 reportEodOffset; + + /** \brief 0 if no reports list, otherwise offset of program to run to + * deliver reports at offset 0. */ + u32 reportZeroOffset; + + /** \brief 0 if no reports list, otherwise offset of program to run to + * deliver reports if EOD is at offset 0. Superset of other programs. */ + u32 reportZeroEodOffset; +}; + +/* NFA Queue Assignment + * + * --- 0 + * (|) chained mpv (if present) + * # + * --- outfixBeginQueue - + * | outfixes. enabled at offset 0. + * | + * # + * --- outfixEndQueue - + * | suffixes. enabled by rose roles. + * | + * # + * --- leftfixBeginQueue - + * | prefixes + * | + * # + * --- ? + * | infixes + * | + * # + */ + +#define ROSE_RUNTIME_FULL_ROSE 0 +#define ROSE_RUNTIME_PURE_LITERAL 1 +#define ROSE_RUNTIME_SINGLE_OUTFIX 2 + +/** + * \brief Runtime structure header for Rose. + * + * Runtime structure header for Rose. 
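+ * Sub-structures are located by byte offsets from the start of this header;
+ * as an illustrative sketch, the floating literal matcher is reached as
+ * (const struct HWLM *)((const char *)t + t->fmatcherOffset).
+ *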
+ * In memory, we follow this with: + * -# the "engine blob" + * -# anchored 'literal' matcher table + * -# floating literal matcher table + * -# eod-anchored literal matcher table + * -# small block table + * -# array of NFA offsets, one per queue + * -# array of state offsets, one per queue (+) + * + * (+) stateOffset array note: Offsets in the array are either into the stream + * state (normal case) or into the tstate region of scratch (for transient rose + * nfas). Rose nfa info table can distinguish the cases. + */ +struct RoseEngine { + u8 pureLiteral; /* Indicator of pure literal API */ + u8 noFloatingRoots; /* only need to run the anchored table if something + * matched in the anchored table */ + u8 requiresEodCheck; /* stuff happens at eod time */ + u8 hasOutfixesInSmallBlock; /**< has at least one outfix that must run even + in small block scans. */ + u8 runtimeImpl; /**< can we just run the floating table or a single outfix? + * or do we need a full rose? */ + u8 mpvTriggeredByLeaf; /**< need to check (suf|out)fixes for mpv trigger */ + u8 canExhaust; /**< every pattern has an exhaustion key */ + u8 hasSom; /**< has at least one pattern which tracks SOM. */ + u8 somHorizon; /**< width in bytes of SOM offset storage (governed by + SOM precision) */ + u32 mode; /**< scanning mode, one of HS_MODE_{BLOCK,STREAM,VECTORED} */ + u32 historyRequired; /**< max amount of history required for streaming */ + u32 ekeyCount; /**< number of exhaustion keys */ + u32 lkeyCount; /**< number of logical keys */ + u32 lopCount; /**< number of logical ops */ + u32 ckeyCount; /**< number of combination keys */ + u32 logicalTreeOffset; /**< offset to mapping from lkey to LogicalOp */ + u32 combInfoMapOffset; /**< offset to mapping from ckey to combInfo */ + u32 dkeyCount; /**< number of dedupe keys */ + u32 dkeyLogSize; /**< size of fatbit for storing dkey log (bytes) */ + u32 invDkeyOffset; /**< offset to table mapping from dkeys to the external + * report ids */ + u32 somLocationCount; /**< number of som locations required */ + u32 somLocationFatbitSize; /**< size of SOM location fatbit (bytes) */ + u32 rolesWithStateCount; // number of roles with entries in state bitset + u32 stateSize; /* size of the state bitset + * WARNING: not the size of the rose state */ + u32 anchorStateSize; /* size of the state for the anchor dfas */ + u32 tStateSize; /* total size of the state for transient rose nfas */ + u32 scratchStateSize; /**< uncompressed state req'd for NFAs in scratch; + * used for sizing scratch only. */ + u32 smallWriteOffset; /**< offset of small-write matcher */ + u32 amatcherOffset; // offset of the anchored literal matcher (bytes) + u32 ematcherOffset; // offset of the eod-anchored literal matcher (bytes) + u32 fmatcherOffset; // offset of the floating literal matcher (bytes) + u32 drmatcherOffset; // offset of the delayed rebuild table (bytes) + u32 sbmatcherOffset; // offset of the small-block literal matcher (bytes) + u32 longLitTableOffset; // offset of the long literal table + u32 amatcherMinWidth; /**< minimum number of bytes required for a pattern + * involved with the anchored table to produce a full + * match. */ + u32 fmatcherMinWidth; /**< minimum number of bytes required for a pattern + * involved with the floating table to produce a full + * match. */ + u32 eodmatcherMinWidth; /**< minimum number of bytes required for a pattern + * involved with the eod table to produce a full + * match. 
*/ + u32 amatcherMaxBiAnchoredWidth; /**< maximum number of bytes that can still + * produce a match for a pattern involved + * with the anchored table. */ + u32 fmatcherMaxBiAnchoredWidth; /**< maximum number of bytes that can still + * produce a match for a pattern involved + * with the anchored table. */ + + /** + * \brief Offset of u32 array of program offsets for reports used by + * output-exposed engines. + */ + u32 reportProgramOffset; + + /** + * \brief Number of programs for reports used by output-exposed engines. + */ + u32 reportProgramCount; + + /** + * \brief Offset of u32 array of program offsets for delayed replay of + * literals. + */ + u32 delayProgramOffset; + + /** + * \brief Offset of u32 array of program offsets for anchored literals. + */ + u32 anchoredProgramOffset; + + u32 activeArrayCount; //number of nfas tracked in the active array + u32 activeLeftCount; //number of nfas tracked in the active rose array + u32 queueCount; /**< number of nfa queues */ + u32 activeQueueArraySize; //!< size of fatbit for active queues (bytes) + + u32 eagerIterOffset; /**< offset to sparse iter for eager prefixes or 0 if + * none */ + + /** \brief Number of keys used by CHECK_SET_HANDLED instructions in role + * programs. */ + u32 handledKeyCount; + + /** \brief Size of the handled keys fatbit in scratch (bytes). */ + u32 handledKeyFatbitSize; + + u32 leftOffset; + u32 roseCount; + + u32 eodProgramOffset; //!< EOD program, otherwise 0. + u32 flushCombProgramOffset; /**< FlushCombination program, otherwise 0 */ + u32 lastFlushCombProgramOffset; /**< LastFlushCombination program, + * otherwise 0 */ + + u32 lastByteHistoryIterOffset; // if non-zero + + /** \brief Minimum number of bytes required to match. */ + u32 minWidth; + + /** \brief Minimum number of bytes required to match, excluding boundary + * reports. */ + u32 minWidthExcludingBoundaries; + + u32 maxBiAnchoredWidth; /* ROSE_BOUND_INF if any non bianchored patterns + * present */ + u32 anchoredDistance; // region to run the anchored table over + u32 anchoredMinDistance; /* start of region to run anchored table over */ + u32 floatingDistance; /* end of region to run the floating table over + ROSE_BOUND_INF if not bounded */ + u32 floatingMinDistance; /* start of region to run floating table over */ + u32 smallBlockDistance; /* end of region to run the floating table over + ROSE_BOUND_INF if not bounded */ + u32 floatingMinLiteralMatchOffset; /* the minimum offset that we can get a + * 'valid' match from the floating + * table */ + u32 nfaInfoOffset; /* offset to the nfa info offset array */ + rose_group initialGroups; + rose_group floating_group_mask; /* groups that are used by the ftable */ + u32 size; // (bytes) + u32 delay_count; /* number of delayed literal ids. */ + u32 delay_fatbit_size; //!< size of each delay fatbit in scratch (bytes) + u32 anchored_count; /* number of anchored literal ids */ + u32 anchored_fatbit_size; //!< size of each anch fatbit in scratch (bytes) + u32 maxFloatingDelayedMatch; /* max offset that a delayed literal can + * usefully be reported */ + u32 delayRebuildLength; /* length of the history region which needs to be + * rescanned when we are doing a delayed literal + * rebuild scan. 
*/ + struct RoseStateOffsets stateOffsets; + struct RoseBoundaryReports boundary; + u32 totalNumLiterals; /* total number of literals including dr */ + u32 asize; /* size of the atable */ + u32 outfixBeginQueue; /* first outfix queue */ + u32 outfixEndQueue; /* one past the last outfix queue */ + u32 leftfixBeginQueue; /* first prefix/infix queue */ + u32 initMpvNfa; /* (allegedly chained) mpv to force on at init */ + u32 rosePrefixCount; /* number of rose prefixes */ + u32 activeLeftIterOffset; /* mmbit_sparse_iter over non-transient roses */ + u32 ematcherRegionSize; /* max region size to pass to ematcher */ + u32 somRevCount; /**< number of som reverse nfas */ + u32 somRevOffsetOffset; /**< offset to array of offsets to som rev nfas */ + u32 longLitStreamState; // size in bytes + + struct scatter_full_plan state_init; +}; + +struct ALIGN_CL_DIRECTIVE anchored_matcher_info { + u32 next_offset; /* relative to this, 0 for end */ + u32 state_offset; /* relative to anchorState */ + u32 anchoredMinDistance; /* start of region to run anchored table over */ +}; + +/** + * \brief Long literal subtable for a particular mode (caseful or nocase). + */ +struct RoseLongLitSubtable { + /** + * \brief Offset of the hash table (relative to RoseLongLitTable base). + * + * Offset is zero if no such table exists. + */ + u32 hashOffset; + + /** + * \brief Offset of the bloom filter (relative to RoseLongLitTable base). + * + * Offset is zero if no such table exists. + */ + u32 bloomOffset; + + /** \brief lg2 of the size of the hash table. */ + u8 hashBits; + + /** \brief Size of the bloom filter in bits. */ + u8 bloomBits; + + /** \brief Number of bits of packed stream state used. */ + u8 streamStateBits; +}; + +/** + * \brief Long literal table header. + */ +struct RoseLongLitTable { + /** + * \brief Total size of the whole table (including strings, bloom filters, + * hash tables). + */ + u32 size; + + /** \brief Caseful sub-table (hash table and bloom filter). */ + struct RoseLongLitSubtable caseful; + + /** \brief Caseless sub-table (hash table and bloom filter). */ + struct RoseLongLitSubtable nocase; + + /** \brief Total size of packed stream state in bytes. */ + u8 streamStateBytes; + + /** \brief Max length of literal prefixes. */ + u8 maxLen; +}; + +/** + * \brief One of these structures per hash table entry in our long literal + * table. + */ +struct RoseLongLitHashEntry { + /** + * \brief Offset of the literal string itself, relative to + * RoseLongLitTable base. Zero if this bucket is empty. + */ + u32 str_offset; + + /** \brief Length of the literal string. 
*/ + u32 str_len; +}; + +static really_inline +const struct anchored_matcher_info *getALiteralMatcher( + const struct RoseEngine *t) { + if (!t->amatcherOffset) { + return NULL; + } + + const char *lt = (const char *)t + t->amatcherOffset; + assert(ISALIGNED_CL(lt)); + return (const struct anchored_matcher_info *)lt; +} + +struct HWLM; + +static really_inline +const struct HWLM *getFLiteralMatcher(const struct RoseEngine *t) { + if (!t->fmatcherOffset) { + return NULL; + } + + const char *lt = (const char *)t + t->fmatcherOffset; + assert(ISALIGNED_CL(lt)); + return (const struct HWLM *)lt; +} + +static really_inline +const void *getSBLiteralMatcher(const struct RoseEngine *t) { + if (!t->sbmatcherOffset) { + return NULL; + } + + const char *matcher = (const char *)t + t->sbmatcherOffset; + assert(ISALIGNED_N(matcher, 8)); + return matcher; +} + +static really_inline +const struct LeftNfaInfo *getLeftTable(const struct RoseEngine *t) { + const struct LeftNfaInfo *r + = (const struct LeftNfaInfo *)((const char *)t + t->leftOffset); + assert(ISALIGNED_N(r, 4)); + return r; +} + +struct mmbit_sparse_iter; // forward decl + +static really_inline +const struct mmbit_sparse_iter *getActiveLeftIter(const struct RoseEngine *t) { + assert(t->activeLeftIterOffset); + const struct mmbit_sparse_iter *it = (const struct mmbit_sparse_iter *) + ((const char *)t + t->activeLeftIterOffset); + assert(ISALIGNED_N(it, 4)); + return it; +} + +static really_inline +const struct NfaInfo *getNfaInfoByQueue(const struct RoseEngine *t, u32 qi) { + const struct NfaInfo *infos + = (const struct NfaInfo *)((const char *)t + t->nfaInfoOffset); + assert(ISALIGNED_N(infos, sizeof(u32))); + + return &infos[qi]; +} + +static really_inline +const struct NFA *getNfaByInfo(const struct RoseEngine *t, + const struct NfaInfo *info) { + return (const struct NFA *)((const char *)t + info->nfaOffset); +} + +static really_inline +const struct NFA *getNfaByQueue(const struct RoseEngine *t, u32 qi) { + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + return getNfaByInfo(t, info); +} + +static really_inline +u32 queueToLeftIndex(const struct RoseEngine *t, u32 qi) { + assert(qi >= t->leftfixBeginQueue); + return qi - t->leftfixBeginQueue; +} + +static really_inline +const struct LeftNfaInfo *getLeftInfoByQueue(const struct RoseEngine *t, + u32 qi) { + const struct LeftNfaInfo *infos = getLeftTable(t); + return &infos[queueToLeftIndex(t, qi)]; +} + +struct SmallWriteEngine; + +static really_inline +const struct SmallWriteEngine *getSmallWrite(const struct RoseEngine *t) { + if (!t->smallWriteOffset) { + return NULL; + } + + const struct SmallWriteEngine *smwr = + (const struct SmallWriteEngine *)((const char *)t + t->smallWriteOffset); + return smwr; +} + +#endif // ROSE_INTERNAL_H diff --git a/regex/rose/rose_program.h b/regex/rose/rose_program.h new file mode 100644 index 000000000..7e21303cb --- /dev/null +++ b/regex/rose/rose_program.h @@ -0,0 +1,724 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Rose data structures to do with role programs. + */ + +#ifndef ROSE_ROSE_PROGRAM_H +#define ROSE_ROSE_PROGRAM_H + +#include "som/som_operation.h" +#include "rose_internal.h" +#include "ue2common.h" +#include "util/simd_types.h" + +/** \brief Minimum alignment for each instruction in memory. */ +#define ROSE_INSTR_MIN_ALIGN 8U + +/** \brief Role program instruction opcodes. */ +enum RoseInstructionCode { + ROSE_INSTR_END, //!< End of program. + ROSE_INSTR_ANCHORED_DELAY, //!< Delay until after anchored matcher. + ROSE_INSTR_CHECK_LIT_EARLY, //!< Skip matches before floating min offset. + ROSE_INSTR_CHECK_GROUPS, //!< Check that literal groups are on. + ROSE_INSTR_CHECK_ONLY_EOD, //!< Role matches only at EOD. + ROSE_INSTR_CHECK_BOUNDS, //!< Bounds on distance from offset 0. + ROSE_INSTR_CHECK_NOT_HANDLED, //!< Test & set role in "handled". + ROSE_INSTR_CHECK_SINGLE_LOOKAROUND, //!< Single lookaround check. + ROSE_INSTR_CHECK_LOOKAROUND, //!< Lookaround check. + ROSE_INSTR_CHECK_MASK, //!< 8-bytes mask check. + ROSE_INSTR_CHECK_MASK_32, //!< 32-bytes and/cmp/neg mask check. + ROSE_INSTR_CHECK_BYTE, //!< Single Byte check. + ROSE_INSTR_CHECK_SHUFTI_16x8, //!< Check 16-byte data by 8-bucket shufti. + ROSE_INSTR_CHECK_SHUFTI_32x8, //!< Check 32-byte data by 8-bucket shufti. + ROSE_INSTR_CHECK_SHUFTI_16x16, //!< Check 16-byte data by 16-bucket shufti. + ROSE_INSTR_CHECK_SHUFTI_32x16, //!< Check 32-byte data by 16-bucket shufti. + ROSE_INSTR_CHECK_INFIX, //!< Infix engine must be in accept state. + ROSE_INSTR_CHECK_PREFIX, //!< Prefix engine must be in accept state. + ROSE_INSTR_PUSH_DELAYED, //!< Push delayed literal matches. + ROSE_INSTR_DUMMY_NOP, //!< NOP. Should not exist in build programs. + ROSE_INSTR_CATCH_UP, //!< Catch up engines, anchored matches. + ROSE_INSTR_CATCH_UP_MPV, //!< Catch up the MPV. + ROSE_INSTR_SOM_ADJUST, //!< Set SOM from a distance to EOM. + ROSE_INSTR_SOM_LEFTFIX, //!< Acquire SOM from a leftfix engine. + ROSE_INSTR_SOM_FROM_REPORT, //!< Acquire SOM from a som_operation. + ROSE_INSTR_SOM_ZERO, //!< Set SOM to zero. + ROSE_INSTR_TRIGGER_INFIX, //!< Trigger an infix engine. + ROSE_INSTR_TRIGGER_SUFFIX, //!< Trigger a suffix engine. + ROSE_INSTR_DEDUPE, //!< Run deduplication for report. + ROSE_INSTR_DEDUPE_SOM, //!< Run deduplication for SOM report. + ROSE_INSTR_REPORT_CHAIN, //!< Fire a chained report (MPV). + ROSE_INSTR_REPORT_SOM_INT, //!< Manipulate SOM only. + ROSE_INSTR_REPORT_SOM_AWARE, //!< Manipulate SOM from SOM-aware source. 
+ + /** \brief Fire a report. */ + ROSE_INSTR_REPORT, + + /** \brief Fire an exhaustible report. */ + ROSE_INSTR_REPORT_EXHAUST, + + /** \brief Fire a SOM report. */ + ROSE_INSTR_REPORT_SOM, + + /** \brief Fire an exhaustible SOM report. */ + ROSE_INSTR_REPORT_SOM_EXHAUST, + + /** \brief Super-instruction combining DEDUPE and REPORT. */ + ROSE_INSTR_DEDUPE_AND_REPORT, + + /** + * \brief Fire a report and stop program execution. This is a + * specialisation intended for short, frequently-executed programs. + */ + ROSE_INSTR_FINAL_REPORT, + + ROSE_INSTR_CHECK_EXHAUSTED, //!< Check if an ekey has already been set. + ROSE_INSTR_CHECK_MIN_LENGTH, //!< Check (EOM - SOM) against min length. + ROSE_INSTR_SET_STATE, //!< Switch a state index on. + ROSE_INSTR_SET_GROUPS, //!< Set some literal group bits. + ROSE_INSTR_SQUASH_GROUPS, //!< Conditionally turn off some groups. + ROSE_INSTR_CHECK_STATE, //!< Test a single bit in the state multibit. + ROSE_INSTR_SPARSE_ITER_BEGIN, //!< Begin running a sparse iter over states. + ROSE_INSTR_SPARSE_ITER_NEXT, //!< Continue running sparse iter over states. + ROSE_INSTR_SPARSE_ITER_ANY, //!< Test for any bit in the sparse iterator. + + /** \brief Check outfixes and suffixes for EOD and fire reports if so. */ + ROSE_INSTR_ENGINES_EOD, + + /** \brief Catch up and check active suffixes for EOD and fire reports if + * so. */ + ROSE_INSTR_SUFFIXES_EOD, + + /** \brief Run the EOD-anchored HWLM literal matcher. */ + ROSE_INSTR_MATCHER_EOD, + + /** + * \brief Confirm a case-sensitive literal at the current offset. In + * streaming mode, this makes use of the long literal table. + */ + ROSE_INSTR_CHECK_LONG_LIT, + + /** + * \brief Confirm a case-insensitive literal at the current offset. In + * streaming mode, this makes use of the long literal table. + */ + ROSE_INSTR_CHECK_LONG_LIT_NOCASE, + + /** + * \brief Confirm a case-sensitive "medium length" literal at the current + * offset. In streaming mode, this will check history if needed. + */ + ROSE_INSTR_CHECK_MED_LIT, + + /** + * \brief Confirm a case-insensitive "medium length" literal at the current + * offset. In streaming mode, this will check history if needed. + */ + ROSE_INSTR_CHECK_MED_LIT_NOCASE, + + /** + * \brief Clear the "work done" flag used by the SQUASH_GROUPS instruction. + */ + ROSE_INSTR_CLEAR_WORK_DONE, + + /** \brief Check lookaround if it has multiple paths. */ + ROSE_INSTR_MULTIPATH_LOOKAROUND, + + /** + * \brief Use shufti to check lookaround with multiple paths. The total + * length of the paths is 16 bytes at most and shufti has 8 buckets. + * All paths can be at most 16 bytes long. + */ + ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_16x8, + + /** + * \brief Use shufti to check lookaround with multiple paths. The total + * length of the paths is 32 bytes at most and shufti has 8 buckets. + * All paths can be at most 16 bytes long. + */ + ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x8, + + /** + * \brief Use shufti to check lookaround with multiple paths. The total + * length of the paths is 32 bytes at most and shufti has 16 buckets. + * All paths can be at most 16 bytes long. + */ + ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x16, + + /** + * \brief Use shufti to check multiple paths lookaround. The total + * length of the paths is 64 bytes at most and shufti has 8 buckets. + * All paths can be at most 16 bytes long. + */ + ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_64, + + /** + * \brief Jump to the program of included literal. + */ + ROSE_INSTR_INCLUDED_JUMP, + + /** + * \brief Set matching status of a sub-expression. 
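+     *
+     * The interpreter handles this by setting the instruction's lkey in the
+     * logical vector in scratch and recording the offset-adjusted match
+     * position for a later FLUSH_COMBINATION.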
+ */ + ROSE_INSTR_SET_LOGICAL, + + /** + * \brief Set combination status pending checking. + */ + ROSE_INSTR_SET_COMBINATION, + + /** + * \brief Check if compliant with any logical constraints. + */ + ROSE_INSTR_FLUSH_COMBINATION, + + /** \brief Mark as exhausted instead of report while quiet. */ + ROSE_INSTR_SET_EXHAUST, + + /** + * \brief Calculate any combination's logical value if none of its + * sub-expression matches until EOD, then check if compliant with any + * logical constraints. + */ + ROSE_INSTR_LAST_FLUSH_COMBINATION, + + ROSE_INSTR_CHECK_SHUFTI_64x8, //!< Check 64-byte data by 8-bucket shufti. + ROSE_INSTR_CHECK_SHUFTI_64x16, //!< Check 64-byte data by 16-bucket shufti. + ROSE_INSTR_CHECK_MASK_64, //!< 64-bytes and/cmp/neg mask check. + + LAST_ROSE_INSTRUCTION = ROSE_INSTR_CHECK_MASK_64 //!< Sentinel. +}; + +struct ROSE_STRUCT_END { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_ANCHORED_DELAY { + u8 code; //!< From enum RoseInstructionCode. + rose_group groups; //!< Bitmask. + u32 anch_id; //!< Program to restart after the delay. + u32 done_jump; //!< Jump forward this many bytes if we have to delay. +}; + +struct ROSE_STRUCT_CHECK_LIT_EARLY { + u8 code; //!< From enum RoseInstructionCode. + u32 min_offset; //!< Minimum offset for this literal. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +/** Note: check failure will halt program. */ +struct ROSE_STRUCT_CHECK_GROUPS { + u8 code; //!< From enum RoseInstructionCode. + rose_group groups; //!< Bitmask. +}; + +struct ROSE_STRUCT_CHECK_ONLY_EOD { + u8 code; //!< From enum RoseInstructionCode. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_BOUNDS { + u8 code; //!< From enum RoseInstructionCode. + u64a min_bound; //!< Min distance from zero. + u64a max_bound; //!< Max distance from zero. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_NOT_HANDLED { + u8 code; //!< From enum RoseInstructionCode. + u32 key; //!< Key in the "handled_roles" fatbit in scratch. + u32 fail_jump; //!< Jump forward this many bytes if we have seen key before. +}; + +struct ROSE_STRUCT_CHECK_SINGLE_LOOKAROUND { + u8 code; //!< From enum RoseInstructionCode. + s8 offset; //!< The offset of the byte to examine. + u32 reach_index; //!< Index for lookaround reach bitvectors. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_LOOKAROUND { + u8 code; //!< From enum RoseInstructionCode. + u32 look_index; //!< Offset in bytecode of lookaround offset list. + u32 reach_index; //!< Offset in bytecode of lookaround reach bitvectors. + u32 count; //!< The count of lookaround entries in one instruction. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MASK { + u8 code; //!< From enum roseInstructionCode. + u64a and_mask; //!< 8-byte and mask. + u64a cmp_mask; //!< 8-byte cmp mask. + u64a neg_mask; //!< 8-byte negation mask. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MASK_32 { + u8 code; //!< From enum RoseInstructionCode. + u8 and_mask[32]; //!< 32-byte and mask. + u8 cmp_mask[32]; //!< 32-byte cmp mask. + u32 neg_mask; //!< negation mask with 32 bits. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. 
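+    /* Rough semantics (roseCheckMask32() is authoritative): each of the 32
+     * bytes starting at `offset` is ANDed with and_mask and compared against
+     * cmp_mask; bits set in neg_mask mark byte positions where that
+     * comparison is required to fail instead of succeed. */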
+}; + +struct ROSE_STRUCT_CHECK_MASK_64 { + u8 code; //!< From enum RoseInstructionCode. + u8 and_mask[64]; //!< 64-byte and mask. + u8 cmp_mask[64]; //!< 64-byte cmp mask. + u64a neg_mask; //!< negation mask with 32 bits. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_BYTE { + u8 code; //!< From enum RoseInstructionCode. + u8 and_mask; //!< 8-bits and mask. + u8 cmp_mask; //!< 8-bits cmp mask. + u8 negation; //!< Flag about negation. + s32 offset; //!< The relative offset. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +// Since m128 and m256 could be missaligned in the bytecode, +// we'll use u8[16] and u8[32] instead in all rose_check_shufti structures. +struct ROSE_STRUCT_CHECK_SHUFTI_16x8 { + u8 code; //!< From enum RoseInstructionCode. + u8 nib_mask[32]; //!< High 16 and low 16 bits nibble mask in shufti. + u8 bucket_select_mask[16]; //!< Mask for bucket assigning. + u32 neg_mask; //!< Negation mask in low 16 bits. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_SHUFTI_32x8 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[16]; //!< High nibble mask in shufti. + u8 lo_mask[16]; //!< Low nibble mask in shufti. + u8 bucket_select_mask[32]; //!< Mask for bucket assigning. + u32 neg_mask; //!< 32 bits negation mask. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_SHUFTI_16x16 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[32]; //!< High nibble mask in shufti. + u8 lo_mask[32]; //!< Low nibble mask in shufti. + u8 bucket_select_mask[32]; //!< Mask for bucket assigning. + u32 neg_mask; //!< Negation mask in low 16 bits. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_SHUFTI_32x16 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[32]; //!< High nibble mask in shufti. + u8 lo_mask[32]; //!< Low nibble mask in shufti. + u8 bucket_select_mask_hi[32]; //!< Bucket mask for high 8 buckets. + u8 bucket_select_mask_lo[32]; //!< Bucket mask for low 8 buckets. + u32 neg_mask; //!< 32 bits negation mask. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_SHUFTI_64x8 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[64]; //!< High nibble mask in shufti. + u8 lo_mask[64]; //!< Low nibble mask in shufti. + u8 bucket_select_mask[64]; //!< Mask for bucket assigning. + u64a neg_mask; //!< 64 bits negation mask. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_SHUFTI_64x16 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask_1[64]; //!< 4 copies of 0-15 High nibble mask. + u8 hi_mask_2[64]; //!< 4 copies of 16-32 High nibble mask. + u8 lo_mask_1[64]; //!< 4 copies of 0-15 Low nibble mask. + u8 lo_mask_2[64]; //!< 4 copies of 16-32 Low nibble mask. + u8 bucket_select_mask_hi[64]; //!< Bucket mask for high 8 buckets. + u8 bucket_select_mask_lo[64]; //!< Bucket mask for low 8 buckets. + u64a neg_mask; //!< 64 bits negation mask. + s32 offset; //!< Relative offset of the first byte. 
+ u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_INFIX { + u8 code; //!< From enum RoseInstructionCode. + u32 queue; //!< Queue of leftfix to check. + u32 lag; //!< Lag of leftfix for this case. + ReportID report; //!< ReportID of leftfix to check. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_PREFIX { + u8 code; //!< From enum RoseInstructionCode. + u32 queue; //!< Queue of leftfix to check. + u32 lag; //!< Lag of leftfix for this case. + ReportID report; //!< ReportID of leftfix to check. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_PUSH_DELAYED { + u8 code; //!< From enum RoseInstructionCode. + u8 delay; // Number of bytes to delay. + u32 index; // Delay literal index (relative to first delay lit). +}; + +struct ROSE_STRUCT_DUMMY_NOP { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_CATCH_UP { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_CATCH_UP_MPV { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_SOM_ADJUST { + u8 code; //!< From enum RoseInstructionCode. + u32 distance; //!< Distance to EOM. +}; + +struct ROSE_STRUCT_SOM_LEFTFIX { + u8 code; //!< From enum RoseInstructionCode. + u32 queue; //!< Queue index of leftfix providing SOM. + u32 lag; //!< Lag of leftfix for this case. +}; + +struct ROSE_STRUCT_SOM_FROM_REPORT { + u8 code; //!< From enum RoseInstructionCode. + struct som_operation som; +}; + +struct ROSE_STRUCT_SOM_ZERO { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_TRIGGER_INFIX { + u8 code; //!< From enum RoseInstructionCode. + u8 cancel; //!< Cancels previous top event. + u32 queue; //!< Queue index of infix. + u32 event; //!< Queue event, from MQE_*. +}; + +struct ROSE_STRUCT_TRIGGER_SUFFIX { + u8 code; //!< From enum RoseInstructionCode. + u32 queue; //!< Queue index of suffix. + u32 event; //!< Queue event, from MQE_*. +}; + +struct ROSE_STRUCT_DEDUPE { + u8 code; //!< From enum RoseInstructionCode. + u8 quash_som; //!< Force SOM to zero for this report. + u32 dkey; //!< Dedupe key. + s32 offset_adjust; //!< Offset adjustment to apply to end offset. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_DEDUPE_SOM { + u8 code; //!< From enum RoseInstructionCode. + u8 quash_som; //!< Force SOM to zero for this report. + u32 dkey; //!< Dedupe key. + s32 offset_adjust; //!< Offset adjustment to apply to end offset. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_REPORT_CHAIN { + u8 code; //!< From enum RoseInstructionCode. + u32 event; //!< Queue event, from MQE_*. Must be a top. + + /** + * \brief Number of bytes behind us that we are allowed to squash + * identical top events on the queue. + */ + u64a top_squash_distance; +}; + +struct ROSE_STRUCT_REPORT_SOM_INT { + u8 code; //!< From enum RoseInstructionCode. + struct som_operation som; +}; + +struct ROSE_STRUCT_REPORT_SOM_AWARE { + u8 code; //!< From enum RoseInstructionCode. + struct som_operation som; +}; + +struct ROSE_STRUCT_REPORT { + u8 code; //!< From enum RoseInstructionCode. + ReportID onmatch; //!< Report ID to deliver to user. + s32 offset_adjust; //!< Offset adjustment to apply to end offset. +}; + +struct ROSE_STRUCT_REPORT_EXHAUST { + u8 code; //!< From enum RoseInstructionCode. + ReportID onmatch; //!< Report ID to deliver to user. 
+ s32 offset_adjust; //!< Offset adjustment to apply to end offset. + u32 ekey; //!< Exhaustion key. +}; + +struct ROSE_STRUCT_REPORT_SOM { + u8 code; //!< From enum RoseInstructionCode. + ReportID onmatch; //!< Report ID to deliver to user. + s32 offset_adjust; //!< Offset adjustment to apply to end offset. +}; + +struct ROSE_STRUCT_REPORT_SOM_EXHAUST { + u8 code; //!< From enum RoseInstructionCode. + ReportID onmatch; //!< Report ID to deliver to user. + s32 offset_adjust; //!< Offset adjustment to apply to end offset. + u32 ekey; //!< Exhaustion key. +}; + +struct ROSE_STRUCT_DEDUPE_AND_REPORT { + u8 code; //!< From enum RoseInstructionCode. + u8 quash_som; //!< Force SOM to zero for this report. + u32 dkey; //!< Dedupe key. + ReportID onmatch; //!< Report ID to deliver to user. + s32 offset_adjust; //!< Offset adjustment to apply to end offset. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_FINAL_REPORT { + u8 code; //!< From enum RoseInstructionCode. + ReportID onmatch; //!< Report ID to deliver to user. + s32 offset_adjust; //!< Offset adjustment to apply to end offset. +}; + +struct ROSE_STRUCT_CHECK_EXHAUSTED { + u8 code; //!< From enum RoseInstructionCode. + u32 ekey; //!< Exhaustion key to check. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MIN_LENGTH { + u8 code; //!< From enum RoseInstructionCode. + s32 end_adj; //!< Offset adjustment to add to EOM first. + u64a min_length; //!< Minimum distance from SOM to EOM. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_SET_STATE { + u8 code; //!< From enum RoseInstructionCode. + u32 index; //!< State index in multibit. +}; + +struct ROSE_STRUCT_SET_GROUPS { + u8 code; //!< From enum RoseInstructionCode. + rose_group groups; //!< Bitmask to OR into groups. +}; + +struct ROSE_STRUCT_SQUASH_GROUPS { + u8 code; //!< From enum RoseInstructionCode. + rose_group groups; //!< Bitmask to AND into groups. +}; + +struct ROSE_STRUCT_CHECK_STATE { + u8 code; //!< From enum RoseInstructionCode. + u32 index; //!< State index in the role multibit. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +/** + * Note that the offsets in the jump table are always relative to the start of + * the program, not the current instruction. + */ +struct ROSE_STRUCT_SPARSE_ITER_BEGIN { + u8 code; //!< From enum RoseInstructionCode. + u32 iter_offset; //!< Offset of mmbit_sparse_iter structure. + u32 jump_table; //!< Offset of jump table indexed by sparse iterator. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +/** + * Note that the offsets in the jump table are always relative to the start of + * the program, not the current instruction. + */ +struct ROSE_STRUCT_SPARSE_ITER_NEXT { + u8 code; //!< From enum RoseInstructionCode. + u32 iter_offset; //!< Offset of mmbit_sparse_iter structure. + u32 jump_table; //!< Offset of jump table indexed by sparse iterator. + u32 state; // Current state index. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_SPARSE_ITER_ANY { + u8 code; //!< From enum RoseInstructionCode. + u32 iter_offset; //!< Offset of mmbit_sparse_iter structure. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_ENGINES_EOD { + u8 code; //!< From enum RoseInstructionCode. + u32 iter_offset; //!< Offset of mmbit_sparse_iter structure. 
+}; + +struct ROSE_STRUCT_SUFFIXES_EOD { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_MATCHER_EOD { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_CHECK_LONG_LIT { + u8 code; //!< From enum RoseInstructionCode. + u32 lit_offset; //!< Offset of literal string. + u32 lit_length; //!< Length of literal string. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_LONG_LIT_NOCASE { + u8 code; //!< From enum RoseInstructionCode. + u32 lit_offset; //!< Offset of literal string. + u32 lit_length; //!< Length of literal string. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MED_LIT { + u8 code; //!< From enum RoseInstructionCode. + u32 lit_offset; //!< Offset of literal string. + u32 lit_length; //!< Length of literal string. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MED_LIT_NOCASE { + u8 code; //!< From enum RoseInstructionCode. + u32 lit_offset; //!< Offset of literal string. + u32 lit_length; //!< Length of literal string. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CLEAR_WORK_DONE { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_MULTIPATH_LOOKAROUND { + u8 code; //!< From enum RoseInstructionCode. + u32 look_index; //!< Offset in bytecode of lookaround offset list. + u32 reach_index; //!< Offset in bytecode of lookaround reach bitvectors. + u32 count; //!< The lookaround byte numbers for each path. + s32 last_start; //!< The latest start offset among 8 paths. + u8 start_mask[MULTIPATH_MAX_LEN]; /*!< Used to initialize path if left-most + * data is missed. */ + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_16x8 { + u8 code; //!< From enum RoseInstructionCode. + u8 nib_mask[2 * sizeof(m128)]; //!< High and low nibble mask in shufti. + u8 bucket_select_mask[sizeof(m128)]; //!< Mask for bucket assigning. + u8 data_select_mask[sizeof(m128)]; //!< Shuffle mask for data ordering. + u32 hi_bits_mask; //!< High-bits used in multi-path validation. + u32 lo_bits_mask; //!< Low-bits used in multi-path validation. + u32 neg_mask; //!< 64 bits negation mask. + s32 base_offset; //!< Relative offset of the first byte. + s32 last_start; //!< The latest start offset among 8 paths. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x8 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[sizeof(m128)]; //!< High nibble mask in shufti. + u8 lo_mask[sizeof(m128)]; //!< Low nibble mask in shufti. + u8 bucket_select_mask[sizeof(m256)]; //!< Mask for bucket assigning. + u8 data_select_mask[sizeof(m256)]; //!< Shuffle mask for data ordering. + u32 hi_bits_mask; //!< High-bits used in multi-path validation. + u32 lo_bits_mask; //!< Low-bits used in multi-path validation. + u32 neg_mask; //!< 64 bits negation mask. + s32 base_offset; //!< Relative offset of the first byte. + s32 last_start; //!< The latest start offset among 8 paths. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x16 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[sizeof(m256)]; //!< High nibble mask in shufti. + u8 lo_mask[sizeof(m256)]; //!< Low nibble mask in shufti. + u8 bucket_select_mask_hi[sizeof(m256)]; //!< Mask for bucket assigning. 
+ u8 bucket_select_mask_lo[sizeof(m256)]; //!< Mask for bucket assigning. + u8 data_select_mask[sizeof(m256)]; //!< Shuffle mask for data ordering. + u32 hi_bits_mask; //!< High-bits used in multi-path validation. + u32 lo_bits_mask; //!< Low-bits used in multi-path validation. + u32 neg_mask; //!< 64 bits negation mask. + s32 base_offset; //!< Relative offset of the first byte. + s32 last_start; //!< The latest start offset among 8 paths. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_64 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[sizeof(m128)]; //!< High nibble mask in shufti. + u8 lo_mask[sizeof(m128)]; //!< Low nibble mask in shufti. + u8 bucket_select_mask[2 * sizeof(m256)]; //!< Mask for bucket assigning. + u8 data_select_mask[2 * sizeof(m256)]; //!< Shuffle mask for data ordering. + u64a hi_bits_mask; //!< High-bits used in multi-path validation. + u64a lo_bits_mask; //!< Low-bits used in multi-path validation. + u64a neg_mask; //!< 64 bits negation mask. + s32 base_offset; //!< Relative offset of the first byte. + s32 last_start; //!< The latest start offset among 8 paths. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_INCLUDED_JUMP { + u8 code; //!< From enum RoseInstructionCode. + u8 squash; //!< FDR confirm squash mask for included literal. + u32 child_offset; //!< Program offset of included literal. +}; + +struct ROSE_STRUCT_SET_LOGICAL { + u8 code; //!< From enum RoseInstructionCode. + u32 lkey; //!< Logical key to set. + s32 offset_adjust; //!< offsetAdjust from struct Report triggers the flush. +}; + +struct ROSE_STRUCT_SET_COMBINATION { + u8 code; //!< From enum RoseInstructionCode. + u32 ckey; //!< Combination key to set. +}; + +struct ROSE_STRUCT_FLUSH_COMBINATION { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_SET_EXHAUST { + u8 code; //!< From enum RoseInstructionCode. + u32 ekey; //!< Exhaustion key. +}; + +struct ROSE_STRUCT_LAST_FLUSH_COMBINATION { + u8 code; //!< From enum RoseInstructionCode. +}; +#endif // ROSE_ROSE_PROGRAM_H diff --git a/regex/rose/rose_types.h b/regex/rose/rose_types.h new file mode 100644 index 000000000..9dcef1cef --- /dev/null +++ b/regex/rose/rose_types.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
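The ROSE_STRUCT_* definitions above form the Rose bytecode: every instruction begins with a u8 code taken from enum RoseInstructionCode, and the check-style instructions carry a fail_jump giving a forward jump in bytes on failure. The toy interpreter below is only an illustrative sketch of how such a stream is consumed; the opcode names, struct layouts, helper names and the assumption that fail_jump is applied from the start of the failing instruction are all invented here and are not part of this patch.

/* Toy sketch, not patch code: minimal stand-ins for the instruction structs
 * above, plus a dispatcher showing how code/fail_jump are meant to be used. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum { TOY_CHECK_BYTE = 1, TOY_REPORT = 2, TOY_END = 3 };

struct toy_check_byte {
    uint8_t  code;       /* opcode; always the first byte of an instruction */
    uint8_t  cmp;        /* byte the input must match */
    int32_t  offset;     /* relative offset of the byte to check */
    uint32_t fail_jump;  /* bytes to jump forward on failure */
};

struct toy_report {
    uint8_t  code;
    uint32_t onmatch;    /* report id to deliver */
};

static void toy_run(const uint8_t *prog, const uint8_t *buf, size_t end)
{
    const uint8_t *pc = prog;
    for (;;) {
        switch (*pc) {
        case TOY_CHECK_BYTE: {
            struct toy_check_byte ins;
            memcpy(&ins, pc, sizeof(ins));   /* bytecode may be unaligned */
            const uint8_t *loc = buf + end + ins.offset;
            if (*loc != ins.cmp) {
                pc += ins.fail_jump;         /* forward jump to fail target */
                break;
            }
            pc += sizeof(ins);
            break;
        }
        case TOY_REPORT: {
            struct toy_report ins;
            memcpy(&ins, pc, sizeof(ins));
            printf("report %u at offset %zu\n", ins.onmatch, end);
            pc += sizeof(ins);
            break;
        }
        case TOY_END:
        default:
            return;
        }
    }
}

int main(void)
{
    /* Program: CHECK_BYTE 'b' one byte back from end, then REPORT 7, then END.
     * fail_jump skips both the check and the report, landing on END. */
    struct toy_check_byte chk = { TOY_CHECK_BYTE, 'b', -1,
                                  sizeof(struct toy_check_byte)
                                      + sizeof(struct toy_report) };
    struct toy_report rep = { TOY_REPORT, 7 };
    uint8_t prog[64], endcode = TOY_END;
    size_t n = 0;
    memcpy(prog + n, &chk, sizeof(chk)); n += sizeof(chk);
    memcpy(prog + n, &rep, sizeof(rep)); n += sizeof(rep);
    memcpy(prog + n, &endcode, 1);
    toy_run(prog, (const uint8_t *)"ab", 2);  /* prints: report 7 at offset 2 */
    return 0;
}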
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Rose runtime types (callbacks, etc). + */ + +#ifndef ROSE_TYPES_H +#define ROSE_TYPES_H + +#include "ue2common.h" + +struct hs_scratch; + +/** + * \brief Continue without checking for exhaustion. + * + * \ref RoseCallback return value indicating that execution should continue and + * that it is not necessary to check if all reports have been exhausted. + */ +#define ROSE_CONTINUE_MATCHING_NO_EXHAUST 2 + +/** + * \brief The type for a Rose callback. + * + * \return + * - \ref MO_HALT_MATCHING if matching should terminate; + * - \ref MO_CONTINUE_MATCHING if matching should continue; + * - \ref ROSE_CONTINUE_MATCHING_NO_EXHAUST if matching should continue and no + * exhaustion is possible. + */ +typedef int (*RoseCallback)(u64a offset, ReportID id, + struct hs_scratch *scratch); + +/** + * \brief The type for a Rose callback which also tracks start of match. + * + * Behaves just like \ref RoseCallback except that it is provided with both a + * start and an end offset. + * + * \see RoseCallback + */ +typedef int (*RoseCallbackSom)(u64a from_offset, u64a to_offset, ReportID id, + struct hs_scratch *scratch); + +#endif diff --git a/regex/rose/runtime.h b/regex/rose/runtime.h new file mode 100644 index 000000000..5fbb2b741 --- /dev/null +++ b/regex/rose/runtime.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime functions shared between various Rose runtime code. 
+ */ + +#ifndef ROSE_RUNTIME_H +#define ROSE_RUNTIME_H + +#include "rose_internal.h" +#include "scratch.h" +#include "util/partial_store.h" + +/* + * ROSE STATE LAYOUT: + * + * - runtime status byte (halt status, delay rebuild dirty, etc) + * - rose state multibit + * - active leaf array (multibit) + * - active leftfix array (multibit) + * - leftfix lag table + * - anchored matcher state + * - literal groups + * - history buffer + * - exhausted bitvector + * - som slots, som multibit arrays + * - nfa stream state (for each nfa) + */ + +#define rose_inline really_inline + +/* Maximum offset that we will eagerly run prefixes to. Beyond this point, eager + * prefixes are always run in exactly the same way as normal prefixes. */ +#define EAGER_STOP_OFFSET 64 + + +static really_inline +const void *getByOffset(const struct RoseEngine *t, u32 offset) { + assert(offset < t->size); + return (const u8 *)t + offset; +} + +static really_inline +void *getRoleState(char *state) { + return state + ROSE_STATE_OFFSET_ROLE_MMBIT; +} + +/** \brief Fetch the active array for suffix nfas. */ +static really_inline +u8 *getActiveLeafArray(const struct RoseEngine *t, char *state) { + return (u8 *)(state + t->stateOffsets.activeLeafArray); +} + +/** \brief Fetch the active array for rose nfas. */ +static really_inline +u8 *getActiveLeftArray(const struct RoseEngine *t, char *state) { + return (u8 *)(state + t->stateOffsets.activeLeftArray); +} + +static really_inline +rose_group loadGroups(const struct RoseEngine *t, const char *state) { + return partial_load_u64a(state + t->stateOffsets.groups, + t->stateOffsets.groups_size); + +} + +static really_inline +void storeGroups(const struct RoseEngine *t, char *state, rose_group groups) { + partial_store_u64a(state + t->stateOffsets.groups, groups, + t->stateOffsets.groups_size); +} + +static really_inline +u8 *getLongLitState(const struct RoseEngine *t, char *state) { + return (u8 *)(state + t->stateOffsets.longLitState); +} + +static really_inline +u8 *getLeftfixLagTable(const struct RoseEngine *t, char *state) { + return (u8 *)(state + t->stateOffsets.leftfixLagTable); +} + +static really_inline +const u8 *getLeftfixLagTableConst(const struct RoseEngine *t, + const char *state) { + return (const u8 *)(state + t->stateOffsets.leftfixLagTable); +} + +static really_inline +u32 has_chained_nfas(const struct RoseEngine *t) { + return t->outfixBeginQueue; +} + +static really_inline +void updateLastMatchOffset(struct RoseContext *tctxt, u64a offset) { + DEBUG_PRINTF("match @%llu, last match @%llu\n", offset, + tctxt->lastMatchOffset); + + assert(offset >= tctxt->minMatchOffset); + assert(offset >= tctxt->lastMatchOffset); + tctxt->lastMatchOffset = offset; +} + +static really_inline +void updateLastCombMatchOffset(struct RoseContext *tctxt, u64a offset) { + DEBUG_PRINTF("match @%llu, last match @%llu\n", offset, + tctxt->lastCombMatchOffset); + + assert(offset >= tctxt->lastCombMatchOffset); + tctxt->lastCombMatchOffset = offset; +} + +static really_inline +void updateMinMatchOffset(struct RoseContext *tctxt, u64a offset) { + DEBUG_PRINTF("min match now @%llu, was @%llu\n", offset, + tctxt->minMatchOffset); + + assert(offset >= tctxt->minMatchOffset); + assert(offset >= tctxt->minNonMpvMatchOffset); + tctxt->minMatchOffset = offset; + tctxt->minNonMpvMatchOffset = offset; +} + +static really_inline +void updateMinMatchOffsetFromMpv(struct RoseContext *tctxt, u64a offset) { + DEBUG_PRINTF("min match now @%llu, was @%llu\n", offset, + tctxt->minMatchOffset); + + 
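loadGroups() and storeGroups() above keep only stateOffsets.groups_size bytes of the 64-bit group mask in stream state, using partial_load_u64a()/partial_store_u64a() from util/partial_store.h, presumably so stream state does not pay for bytes the group mask never uses. A minimal sketch of that idea follows; the helper names are invented, a little-endian host is assumed, and the real helpers are more careful than this.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Keep only `bytes` low-order bytes of a 64-bit mask; the caller guarantees
 * the discarded high bytes are zero (the state size is chosen at compile time
 * so that every group bit fits). */
static void sketch_partial_store_u64(void *dst, uint64_t v, unsigned bytes)
{
    assert(bytes >= 1 && bytes <= 8);
    assert(bytes == 8 || (v >> (8 * bytes)) == 0);
    memcpy(dst, &v, bytes);                /* little-endian assumption */
}

static uint64_t sketch_partial_load_u64(const void *src, unsigned bytes)
{
    uint64_t v = 0;
    assert(bytes >= 1 && bytes <= 8);
    memcpy(&v, src, bytes);                /* little-endian assumption */
    return v;
}

int main(void)
{
    uint8_t state[3];                      /* e.g. groups_size == 3 */
    sketch_partial_store_u64(state, 0x051234ULL, sizeof(state));
    printf("groups=%#llx\n",
           (unsigned long long)sketch_partial_load_u64(state, sizeof(state)));
    return 0;
}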
assert(offset >= tctxt->minMatchOffset); + assert(tctxt->minNonMpvMatchOffset >= tctxt->minMatchOffset); + tctxt->minMatchOffset = offset; + tctxt->minNonMpvMatchOffset = MAX(tctxt->minNonMpvMatchOffset, offset); +} +#endif diff --git a/regex/rose/stream.c b/regex/rose/stream.c new file mode 100644 index 000000000..26268dd57 --- /dev/null +++ b/regex/rose/stream.c @@ -0,0 +1,752 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "catchup.h" +#include "counting_miracle.h" +#include "infix.h" +#include "match.h" +#include "miracle.h" +#include "program_runtime.h" +#include "rose.h" +#include "rose_internal.h" +#include "stream_long_lit.h" +#include "hwlm/hwlm.h" +#include "nfa/mcclellan.h" +#include "nfa/nfa_api.h" +#include "nfa/nfa_api_queue.h" +#include "nfa/nfa_internal.h" +#include "util/fatbit.h" + +static rose_inline +void runAnchoredTableStream(const struct RoseEngine *t, const void *atable, + size_t alen, u64a offset, + struct hs_scratch *scratch) { + char *state_base = scratch->core_info.state + t->stateOffsets.anchorState; + const struct anchored_matcher_info *curr = atable; + + do { + DEBUG_PRINTF("--anchored nfa (+%u) no %u so %u\n", + curr->anchoredMinDistance, curr->next_offset, + curr->state_offset); + const struct NFA *nfa + = (const struct NFA *)((const char *)curr + sizeof(*curr)); + assert(ISALIGNED_CL(nfa)); + assert(isMcClellanType(nfa->type)); + + char *state = state_base + curr->state_offset; + + char start = 0; + size_t adj = 0; + + if (offset <= curr->anchoredMinDistance) { + adj = curr->anchoredMinDistance - offset; + if (adj >= alen) { + goto next_nfa; + } + + start = 1; + } else { + // (No state decompress necessary.) 
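runAnchoredTableStream() walks a sequence of variable-sized anchored_matcher_info records laid end to end: each record's next_offset gives the byte distance to the following record, and a next_offset of zero terminates the walk. Below is a self-contained sketch of that record-chain idiom; the record contents are invented and are not the real anchored_matcher_info layout.

#include <stdint.h>
#include <stdio.h>

struct sketch_rec {
    uint32_t next_offset;   /* bytes from this record to the next; 0 == last */
    uint32_t payload;       /* stand-in for the real per-matcher fields */
};

static void walk_chain(const void *table)
{
    const struct sketch_rec *curr = table;
    do {
        printf("record payload=%u\n", curr->payload);
        if (!curr->next_offset) {
            break;
        }
        curr = (const void *)((const char *)curr + curr->next_offset);
    } while (1);
}

int main(void)
{
    /* Two records packed back to back. */
    struct sketch_rec recs[2] = {
        { sizeof(struct sketch_rec), 1 },   /* points just past itself */
        { 0, 2 },                           /* next_offset 0 ends the walk */
    };
    walk_chain(recs);                       /* prints payload=1 then payload=2 */
    return 0;
}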
+ if (nfa->type == MCCLELLAN_NFA_8) { + if (!*(u8 *)state) { + goto next_nfa; + } + } else { + if (!unaligned_load_u16(state)) { + goto next_nfa; + } + } + } + + if (nfa->type == MCCLELLAN_NFA_8) { + nfaExecMcClellan8_SimpStream(nfa, state, scratch->core_info.buf, + start, adj, alen, roseAnchoredCallback, + scratch); + } else { + nfaExecMcClellan16_SimpStream(nfa, state, scratch->core_info.buf, + start, adj, alen, + roseAnchoredCallback, scratch); + } + + next_nfa: + if (!curr->next_offset) { + break; + } + + curr = (const void *)((const char *)curr + curr->next_offset); + } while (1); +} + + +static really_inline +void saveStreamState(const struct NFA *nfa, struct mq *q, s64a loc) { + DEBUG_PRINTF("offset=%llu, length=%zu, hlength=%zu, loc=%lld\n", + q->offset, q->length, q->hlength, loc); + nfaQueueCompressState(nfa, q, loc); +} + +static really_inline +u8 getByteBefore(const struct core_info *ci, s64a sp) { + if (sp > 0) { // in main buffer + assert(sp <= (s64a)ci->len); + return ci->buf[sp - 1]; + } + // in history buffer + assert(-sp < (s64a)ci->hlen); + return ci->hbuf[ci->hlen + sp - 1]; +} + +/** \brief Return value for \ref roseScanForMiracles. */ +enum MiracleAction { + MIRACLE_DEAD, //!< kill off this engine + MIRACLE_SAVED, //!< engine has been caught up and state saved + MIRACLE_CONTINUE //!< continue running and catch up engine +}; + +static really_inline +enum MiracleAction roseScanForMiracles(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch, u32 qi, + const struct LeftNfaInfo *left, + const struct NFA *nfa) { + struct core_info *ci = &scratch->core_info; + const u32 qCount = t->queueCount; + struct mq *q = scratch->queues + qi; + + const char q_active = fatbit_isset(scratch->aqa, qCount, qi); + DEBUG_PRINTF("q_active=%d\n", q_active); + + const s64a begin_loc = q_active ? q_cur_loc(q) : 0; + const s64a end_loc = ci->len; + + s64a miracle_loc; + if (roseMiracleOccurs(t, left, ci, begin_loc, end_loc, &miracle_loc)) { + goto found_miracle; + } + + if (roseCountingMiracleOccurs(t, left, ci, begin_loc, end_loc, + &miracle_loc)) { + goto found_miracle; + } + + DEBUG_PRINTF("no miracle\n"); + return MIRACLE_CONTINUE; + +found_miracle: + DEBUG_PRINTF("miracle at %lld\n", miracle_loc); + + if (left->infix) { + if (!q_active) { + DEBUG_PRINTF("killing infix\n"); + return MIRACLE_DEAD; + } + + DEBUG_PRINTF("skip q forward, %lld to %lld\n", begin_loc, miracle_loc); + q_skip_forward_to(q, miracle_loc); + if (q_last_type(q) == MQE_START) { + DEBUG_PRINTF("miracle caused infix to die\n"); + return MIRACLE_DEAD; + } + + DEBUG_PRINTF("re-init infix state\n"); + assert(q->items[q->cur].type == MQE_START); + q->items[q->cur].location = miracle_loc; + nfaQueueInitState(q->nfa, q); + } else { + if (miracle_loc > end_loc - t->historyRequired) { + char *streamState = state + getNfaInfoByQueue(t, qi)->stateOffset; + u64a offset = ci->buf_offset + miracle_loc; + u8 key = offset ? 
getByteBefore(ci, miracle_loc) : 0; + DEBUG_PRINTF("init state, key=0x%02x, offset=%llu\n", key, offset); + if (!nfaInitCompressedState(nfa, offset, streamState, key)) { + return MIRACLE_DEAD; + } + storeRoseDelay(t, state, left, (s64a)ci->len - miracle_loc); + return MIRACLE_SAVED; + } + + DEBUG_PRINTF("re-init prefix (skip %lld->%lld)\n", begin_loc, + miracle_loc); + if (!q_active) { + fatbit_set(scratch->aqa, qCount, qi); + initRoseQueue(t, qi, left, scratch); + } + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, miracle_loc); + pushQueueAt(q, 1, MQE_TOP, miracle_loc); + nfaQueueInitState(q->nfa, q); + } + + return MIRACLE_CONTINUE; +} + + +static really_inline +char roseCatchUpLeftfix(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch, u32 qi, + const struct LeftNfaInfo *left) { + assert(!left->transient); // active roses only + + struct core_info *ci = &scratch->core_info; + const u32 qCount = t->queueCount; + struct mq *q = scratch->queues + qi; + const struct NFA *nfa = getNfaByQueue(t, qi); + + if (nfaSupportsZombie(nfa) + && ci->buf_offset /* prefix can be alive with no q */ + && !fatbit_isset(scratch->aqa, qCount, qi) + && isZombie(t, state, left)) { + DEBUG_PRINTF("yawn - zombie\n"); + return 1; + } + + if (left->stopTable) { + enum MiracleAction mrv = + roseScanForMiracles(t, state, scratch, qi, left, nfa); + switch (mrv) { + case MIRACLE_DEAD: + return 0; + case MIRACLE_SAVED: + return 1; + default: + assert(mrv == MIRACLE_CONTINUE); + break; + } + } + + if (!fatbit_set(scratch->aqa, qCount, qi)) { + initRoseQueue(t, qi, left, scratch); + + s32 sp; + if (ci->buf_offset) { + sp = -(s32)loadRoseDelay(t, state, left); + } else { + sp = 0; + } + + DEBUG_PRINTF("ci->len=%zu, sp=%d, historyRequired=%u\n", ci->len, sp, + t->historyRequired); + + if ( ci->len - sp + 1 < t->historyRequired) { + // we'll end up safely in the history region. + DEBUG_PRINTF("safely in history, skipping\n"); + storeRoseDelay(t, state, left, (s64a)ci->len - sp); + return 1; + } + + pushQueueAt(q, 0, MQE_START, sp); + if (left->infix || ci->buf_offset + sp > 0) { + loadStreamState(nfa, q, sp); + } else { + pushQueueAt(q, 1, MQE_TOP, sp); + nfaQueueInitState(nfa, q); + } + } else { + DEBUG_PRINTF("queue already active\n"); + if (q->end - q->cur == 1 && q_cur_type(q) == MQE_START) { + DEBUG_PRINTF("empty queue, start loc=%lld\n", q_cur_loc(q)); + s64a last_loc = q_cur_loc(q); + if (ci->len - last_loc + 1 < t->historyRequired) { + // we'll end up safely in the history region. + DEBUG_PRINTF("safely in history, saving state and skipping\n"); + saveStreamState(nfa, q, last_loc); + storeRoseDelay(t, state, left, (s64a)ci->len - last_loc); + return 1; + } + } + } + + // Determine whether the byte before last_loc will be in the history + // buffer on the next stream write. + s64a last_loc = q_last_loc(q); + s64a leftovers = ci->len - last_loc; + if (leftovers + 1 >= t->historyRequired) { + u32 catchup_offset = left->maxLag ? 
left->maxLag - 1 : 0; + last_loc = (s64a)ci->len - catchup_offset; + } + + if (left->infix) { + if (infixTooOld(q, last_loc)) { + DEBUG_PRINTF("infix died of old age\n"); + return 0; + } + reduceInfixQueue(q, last_loc, left->maxQueueLen, q->nfa->maxWidth); + } + + DEBUG_PRINTF("end scan at %lld\n", last_loc); + pushQueueNoMerge(q, MQE_END, last_loc); + +#ifdef DEBUG + debugQueue(q); +#endif + + char rv = nfaQueueExecRose(nfa, q, MO_INVALID_IDX); + if (!rv) { /* nfa is dead */ + DEBUG_PRINTF("died catching up to stream boundary\n"); + return 0; + } else { + DEBUG_PRINTF("alive, saving stream state\n"); + if (nfaSupportsZombie(nfa) && + nfaGetZombieStatus(nfa, q, last_loc) == NFA_ZOMBIE_ALWAYS_YES) { + DEBUG_PRINTF("not so fast - zombie\n"); + setAsZombie(t, state, left); + } else { + saveStreamState(nfa, q, last_loc); + storeRoseDelay(t, state, left, (s64a)ci->len - last_loc); + } + } + + return 1; +} + +static rose_inline +void roseCatchUpLeftfixes(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch) { + if (!t->activeLeftIterOffset) { + // No sparse iter, no non-transient roses. + return; + } + + // As per UE-1629, we catch up leftfix engines to: + // * current position (last location in the queue, or last location we + // executed to if the queue is empty) if that position (and the byte + // before so we can decompress the stream state) will be in the history + // buffer on the next stream write; OR + // * (stream_boundary - max_delay) other + + u8 *ara = getActiveLeftArray(t, state); /* indexed by offsets into + * left_table */ + const u32 arCount = t->activeLeftCount; + const struct LeftNfaInfo *left_table = getLeftTable(t); + const struct mmbit_sparse_iter *it = getActiveLeftIter(t); + + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + u32 idx = 0; + u32 ri = mmbit_sparse_iter_begin(ara, arCount, &idx, it, si_state); + for (; ri != MMB_INVALID; + ri = mmbit_sparse_iter_next(ara, arCount, ri, &idx, it, si_state)) { + const struct LeftNfaInfo *left = left_table + ri; + u32 qi = ri + t->leftfixBeginQueue; + DEBUG_PRINTF("leftfix %u of %u, maxLag=%u, infix=%d\n", ri, arCount, + left->maxLag, (int)left->infix); + if (!roseCatchUpLeftfix(t, state, scratch, qi, left)) { + DEBUG_PRINTF("removing rose %u from active list\n", ri); + DEBUG_PRINTF("groups old=%016llx mask=%016llx\n", + scratch->tctxt.groups, left->squash_mask); + scratch->tctxt.groups &= left->squash_mask; + mmbit_unset(ara, arCount, ri); + } + } +} + +// Saves out stream state for all our active suffix NFAs. +static rose_inline +void roseSaveNfaStreamState(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch) { + struct mq *queues = scratch->queues; + u8 *aa = getActiveLeafArray(t, state); + u32 aaCount = t->activeArrayCount; + + if (scratch->tctxt.mpv_inactive) { + DEBUG_PRINTF("mpv is dead as a doornail\n"); + /* mpv if it exists is queue 0 */ + mmbit_unset(aa, aaCount, 0); + } + + for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; + qi = mmbit_iterate(aa, aaCount, qi)) { + DEBUG_PRINTF("saving stream state for qi=%u\n", qi); + + struct mq *q = queues + qi; + + // If it's active, it should have an active queue (as we should have + // done some work!) 
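roseCatchUpLeftfixes() and roseSaveNfaStreamState() both drive their work from "active" bit arrays, one bit per leftfix or suffix engine, visited with the multibit iterators (mmbit_iterate, mmbit_sparse_iter_*). The sketch below reduces that pattern to a plain bitvector with invented helper names; it is not the real multibit API, only the shape of the loop.

#include <stdint.h>
#include <stdio.h>

#define SKETCH_INVALID 0xffffffffu

/* Index of the first set bit at or after `from`, or SKETCH_INVALID. */
static uint32_t sketch_next_active(const uint8_t *bits, uint32_t total,
                                   uint32_t from)
{
    for (uint32_t i = from; i < total; i++) {
        if (bits[i / 8] & (1u << (i % 8))) {
            return i;
        }
    }
    return SKETCH_INVALID;
}

int main(void)
{
    uint8_t active[2] = { 0 };       /* room for 16 engines, none active yet */
    active[0] |= 1u << 3;            /* engine 3 active */
    active[1] |= 1u << 1;            /* engine 9 active */

    /* Visit every active engine, as the catch-up/save loops do with mmbit. */
    for (uint32_t qi = sketch_next_active(active, 16, 0);
         qi != SKETCH_INVALID;
         qi = sketch_next_active(active, 16, qi + 1)) {
        printf("catch up engine %u\n", qi);
    }
    return 0;
}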
+ assert(fatbit_isset(scratch->aqa, t->queueCount, qi)); + + const struct NFA *nfa = getNfaByQueue(t, qi); + saveStreamState(nfa, q, q_cur_loc(q)); + } +} + +static rose_inline +void ensureStreamNeatAndTidy(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch, size_t length, + u64a offset) { + struct RoseContext *tctxt = &scratch->tctxt; + + if (roseCatchUpTo(t, scratch, length + scratch->core_info.buf_offset) == + HWLM_TERMINATE_MATCHING) { + return; /* dead; no need to clean up state. */ + } + roseSaveNfaStreamState(t, state, scratch); + roseCatchUpLeftfixes(t, state, scratch); + roseFlushLastByteHistory(t, scratch, offset + length); + tctxt->lastEndOffset = offset + length; + storeGroups(t, state, tctxt->groups); + storeLongLiteralState(t, state, scratch); +} + +static really_inline +void do_rebuild(const struct RoseEngine *t, struct hs_scratch *scratch) { + assert(t->drmatcherOffset); + assert(!can_stop_matching(scratch)); + + const struct HWLM *hwlm = getByOffset(t, t->drmatcherOffset); + size_t len = MIN(scratch->core_info.hlen, t->delayRebuildLength); + const u8 *buf = scratch->core_info.hbuf + scratch->core_info.hlen - len; + DEBUG_PRINTF("BEGIN FLOATING REBUILD over %zu bytes\n", len); + + scratch->core_info.status &= ~STATUS_DELAY_DIRTY; + + hwlmExec(hwlm, buf, len, 0, roseDelayRebuildCallback, scratch, + scratch->tctxt.groups); + assert(!can_stop_matching(scratch)); +} + +static rose_inline +void runEagerPrefixesStream(const struct RoseEngine *t, + struct hs_scratch *scratch) { + if (!t->eagerIterOffset + || scratch->core_info.buf_offset >= EAGER_STOP_OFFSET) { + return; + } + + char *state = scratch->core_info.state; + u8 *ara = getActiveLeftArray(t, state); /* indexed by offsets into + * left_table */ + const u32 arCount = t->activeLeftCount; + const u32 qCount = t->queueCount; + const struct LeftNfaInfo *left_table = getLeftTable(t); + const struct mmbit_sparse_iter *it = getByOffset(t, t->eagerIterOffset); + + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + u32 idx = 0; + u32 ri = mmbit_sparse_iter_begin(ara, arCount, &idx, it, si_state); + for (; ri != MMB_INVALID; + ri = mmbit_sparse_iter_next(ara, arCount, ri, &idx, it, si_state)) { + const struct LeftNfaInfo *left = left_table + ri; + u32 qi = ri + t->leftfixBeginQueue; + DEBUG_PRINTF("leftfix %u of %u, maxLag=%u\n", ri, arCount, left->maxLag); + + assert(!fatbit_isset(scratch->aqa, qCount, qi)); + assert(left->eager); + assert(!left->infix); + + struct mq *q = scratch->queues + qi; + const struct NFA *nfa = getNfaByQueue(t, qi); + s64a loc = MIN(scratch->core_info.len, + EAGER_STOP_OFFSET - scratch->core_info.buf_offset); + + fatbit_set(scratch->aqa, qCount, qi); + initRoseQueue(t, qi, left, scratch); + + if (scratch->core_info.buf_offset) { + s64a sp = left->transient ? -(s64a)scratch->core_info.hlen + : -(s64a)loadRoseDelay(t, state, left); + pushQueueAt(q, 0, MQE_START, sp); + if (scratch->core_info.buf_offset + sp > 0) { + loadStreamState(nfa, q, sp); + /* if the leftfix fix is currently in a match state, we cannot + * advance it. 
*/ + if (nfaInAnyAcceptState(nfa, q)) { + continue; + } + pushQueueAt(q, 1, MQE_END, loc); + } else { + pushQueueAt(q, 1, MQE_TOP, sp); + pushQueueAt(q, 2, MQE_END, loc); + nfaQueueInitState(q->nfa, q); + } + } else { + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + pushQueueAt(q, 2, MQE_END, loc); + nfaQueueInitState(nfa, q); + } + + char alive = nfaQueueExecToMatch(q->nfa, q, loc); + + if (!alive) { + DEBUG_PRINTF("queue %u dead, squashing\n", qi); + mmbit_unset(ara, arCount, ri); + fatbit_unset(scratch->aqa, qCount, qi); + scratch->tctxt.groups &= left->squash_mask; + } else if (q->cur == q->end) { + assert(alive != MO_MATCHES_PENDING); + /* unlike in block mode we cannot squash groups if there is no match + * in this block as we need the groups on for later stream writes */ + /* TODO: investigate possibility of a method to suppress groups for + * a single stream block. */ + DEBUG_PRINTF("queue %u finished, nfa lives\n", qi); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + assert(alive == MO_MATCHES_PENDING); + DEBUG_PRINTF("queue %u unfinished, nfa lives\n", qi); + q->end--; /* remove end item */ + } + } +} + +static really_inline +int can_never_match(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch, size_t length, u64a offset) { + struct RoseContext *tctxt = &scratch->tctxt; + + if (tctxt->groups) { + DEBUG_PRINTF("still has active groups\n"); + return 0; + } + + if (offset + length <= t->anchoredDistance) { /* not < as may have eod */ + DEBUG_PRINTF("still in anchored region\n"); + return 0; + } + + if (t->lastByteHistoryIterOffset) { /* last byte history is hard */ + DEBUG_PRINTF("last byte history\n"); + return 0; + } + + if (mmbit_any(getActiveLeafArray(t, state), t->activeArrayCount)) { + DEBUG_PRINTF("active leaf\n"); + return 0; + } + + return 1; +} + +void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) { + DEBUG_PRINTF("OH HAI [%llu, %llu)\n", scratch->core_info.buf_offset, + scratch->core_info.buf_offset + (u64a)scratch->core_info.len); + assert(t); + assert(scratch->core_info.hbuf); + assert(scratch->core_info.buf); + + // We should not have been called if we've already been told to terminate + // matching. + assert(!told_to_stop_matching(scratch)); + + assert(mmbit_sparse_iter_state_size(t->rolesWithStateCount) + < MAX_SPARSE_ITER_STATES); + + size_t length = scratch->core_info.len; + u64a offset = scratch->core_info.buf_offset; + + // We may have a maximum width (for engines constructed entirely + // of bi-anchored patterns). If this write would result in us progressing + // beyond this point, we cannot possibly match. 
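roseStreamExec() clamps each matcher's scan window against the pattern set's distance bounds: the anchored table only sees the part of the write before anchoredDistance, and the floating matcher only the bytes between floatingMinDistance and floatingDistance. The arithmetic is easy to get backwards, so here is a worked sketch with invented numbers; the formulas mirror the ones in the function, but nothing below comes from a real compiled database.

#include <stdint.h>
#include <stdio.h>

#define SKETCH_MIN(a, b) ((a) < (b) ? (a) : (b))
#define SKETCH_BOUND_INF 0xffffffffu     /* stand-in for ROSE_BOUND_INF */

int main(void)
{
    /* Hypothetical stream write: 80 bytes arriving at stream offset 50. */
    uint64_t offset = 50, length = 80;

    /* Compile-time bounds, invented for the example. */
    uint32_t anchored_distance = 64;
    uint32_t floating_min_distance = 100;
    uint32_t floating_distance = SKETCH_BOUND_INF;

    /* Anchored matcher: only the slice of this write before anchoredDistance. */
    uint64_t alen = anchored_distance > offset
        ? SKETCH_MIN(length + offset, anchored_distance) - offset : 0;

    /* Floating matcher: scan length, clamped by floatingDistance... */
    uint64_t flen = length;
    if (floating_distance != SKETCH_BOUND_INF) {
        flen = floating_distance > offset
            ? SKETCH_MIN(floating_distance, length + offset) - offset : 0;
    }

    /* ...and a start offset so bytes before floatingMinDistance are skipped. */
    uint64_t start = 0;
    if (offset < floating_min_distance) {
        start = floating_min_distance - offset;
    }

    printf("alen=%llu flen=%llu start=%llu\n",
           (unsigned long long)alen, (unsigned long long)flen,
           (unsigned long long)start);   /* prints: alen=14 flen=80 start=50 */
    return 0;
}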
+ if (t->maxBiAnchoredWidth != ROSE_BOUND_INF + && offset + length > t->maxBiAnchoredWidth) { + DEBUG_PRINTF("bailing, write would progress beyond maxBAWidth\n"); + return; + } + + char *state = scratch->core_info.state; + + struct RoseContext *tctxt = &scratch->tctxt; + tctxt->mpv_inactive = 0; + tctxt->groups = loadGroups(t, state); + tctxt->lit_offset_adjust = offset + 1; // index after last byte + tctxt->delayLastEndOffset = offset; + tctxt->lastEndOffset = offset; + tctxt->filledDelayedSlots = 0; + tctxt->lastMatchOffset = 0; + tctxt->lastCombMatchOffset = offset; + tctxt->minMatchOffset = offset; + tctxt->minNonMpvMatchOffset = offset; + tctxt->next_mpv_offset = 0; + + DEBUG_PRINTF("BEGIN: history len=%zu, buffer len=%zu groups=%016llx\n", + scratch->core_info.hlen, scratch->core_info.len, tctxt->groups); + + fatbit_clear(scratch->aqa); + scratch->al_log_sum = 0; + scratch->catchup_pq.qm_size = 0; + + if (t->outfixBeginQueue != t->outfixEndQueue) { + streamInitSufPQ(t, state, scratch); + } + + runEagerPrefixesStream(t, scratch); + + u32 alen = t->anchoredDistance > offset ? + MIN(length + offset, t->anchoredDistance) - offset : 0; + + const struct anchored_matcher_info *atable = getALiteralMatcher(t); + if (atable && alen) { + DEBUG_PRINTF("BEGIN ANCHORED %zu/%u\n", scratch->core_info.hlen, alen); + runAnchoredTableStream(t, atable, alen, offset, scratch); + + if (can_stop_matching(scratch)) { + goto exit; + } + } + + const struct HWLM *ftable = getFLiteralMatcher(t); + if (ftable) { + // Load in long literal table state and set up "fake history" buffers + // (ll_buf, etc, used by the CHECK_LONG_LIT instruction). Note that this + // must be done here in order to ensure that it happens before any path + // that leads to storeLongLiteralState(), which relies on these buffers. + loadLongLiteralState(t, state, scratch); + + if (t->noFloatingRoots && !roseHasInFlightMatches(t, state, scratch)) { + DEBUG_PRINTF("skip FLOATING: no inflight matches\n"); + goto flush_delay_and_exit; + } + + size_t flen = length; + if (t->floatingDistance != ROSE_BOUND_INF) { + flen = t->floatingDistance > offset ? + MIN(t->floatingDistance, length + offset) - offset : 0; + } + + size_t hlength = scratch->core_info.hlen; + + char rebuild = hlength && + (scratch->core_info.status & STATUS_DELAY_DIRTY) && + (t->maxFloatingDelayedMatch == ROSE_BOUND_INF || + offset < t->maxFloatingDelayedMatch); + DEBUG_PRINTF("**rebuild %hhd status %hhu mfdm %u, offset %llu\n", + rebuild, scratch->core_info.status, + t->maxFloatingDelayedMatch, offset); + + if (rebuild) { /* rebuild floating delayed match stuff */ + do_rebuild(t, scratch); + } + + if (!flen) { + goto flush_delay_and_exit; + } + + if (flen + offset <= t->floatingMinDistance) { + DEBUG_PRINTF("skip FLOATING: before floating min\n"); + goto flush_delay_and_exit; + } + + size_t start = 0; + if (offset < t->floatingMinDistance) { + // This scan crosses the floating min distance, so we can use that + // to set HWLM's "start" offset. 
+ start = t->floatingMinDistance - offset; + } + DEBUG_PRINTF("start=%zu\n", start); + + DEBUG_PRINTF("BEGIN FLOATING (over %zu/%zu)\n", flen, length); + hwlmExecStreaming(ftable, flen, start, roseFloatingCallback, scratch, + tctxt->groups & t->floating_group_mask); + } + +flush_delay_and_exit: + DEBUG_PRINTF("flushing floating\n"); + if (cleanUpDelayed(t, scratch, length, offset) == HWLM_TERMINATE_MATCHING) { + return; + } + +exit: + DEBUG_PRINTF("CLEAN UP TIME\n"); + if (!can_stop_matching(scratch)) { + ensureStreamNeatAndTidy(t, state, scratch, length, offset); + } + + if (!told_to_stop_matching(scratch) + && can_never_match(t, state, scratch, length, offset)) { + DEBUG_PRINTF("PATTERN SET IS EXHAUSTED\n"); + scratch->core_info.status = STATUS_EXHAUSTED; + return; + } + + DEBUG_PRINTF("DONE STREAMING SCAN, status = %u\n", + scratch->core_info.status); + return; +} + +static rose_inline +void roseStreamInitEod(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch) { + struct RoseContext *tctxt = &scratch->tctxt; + /* TODO: diff groups for eod */ + tctxt->groups = loadGroups(t, scratch->core_info.state); + tctxt->lit_offset_adjust = scratch->core_info.buf_offset + - scratch->core_info.hlen + + 1; // index after last byte + tctxt->delayLastEndOffset = offset; + tctxt->lastEndOffset = offset; + tctxt->filledDelayedSlots = 0; + tctxt->lastMatchOffset = 0; + tctxt->lastCombMatchOffset = offset; /* DO NOT set 0 here! */ + tctxt->minMatchOffset = offset; + tctxt->minNonMpvMatchOffset = offset; + tctxt->next_mpv_offset = offset; + + scratch->catchup_pq.qm_size = 0; + scratch->al_log_sum = 0; /* clear the anchored logs */ + + fatbit_clear(scratch->aqa); +} + +void roseStreamEodExec(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch) { + assert(scratch); + assert(t->requiresEodCheck); + DEBUG_PRINTF("ci buf %p/%zu his %p/%zu\n", scratch->core_info.buf, + scratch->core_info.len, scratch->core_info.hbuf, + scratch->core_info.hlen); + + // We should not have been called if we've already been told to terminate + // matching. + assert(!told_to_stop_matching(scratch)); + + if (t->maxBiAnchoredWidth != ROSE_BOUND_INF + && offset > t->maxBiAnchoredWidth) { + DEBUG_PRINTF("bailing, we are beyond max width\n"); + /* also some of the history/state may be stale */ + return; + } + + if (!t->eodProgramOffset) { + DEBUG_PRINTF("no eod program\n"); + return; + } + + roseStreamInitEod(t, offset, scratch); + + DEBUG_PRINTF("running eod program at %u\n", t->eodProgramOffset); + + // There should be no pending delayed literals. + assert(!scratch->tctxt.filledDelayedSlots); + + const u64a som = 0; + const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; + + // Note: we ignore the result, as this is the last thing to ever happen on + // a scan. + roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, flags); +} diff --git a/regex/rose/stream_long_lit.h b/regex/rose/stream_long_lit.h new file mode 100644 index 000000000..df9b57f4e --- /dev/null +++ b/regex/rose/stream_long_lit.h @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef STREAM_LONG_LIT_H +#define STREAM_LONG_LIT_H + +#include "rose.h" +#include "rose_common.h" +#include "rose_internal.h" +#include "stream_long_lit_hash.h" +#include "util/compare.h" +#include "util/copybytes.h" + +static really_inline +const struct RoseLongLitHashEntry * +getHashTableBase(const struct RoseLongLitTable *ll_table, + const struct RoseLongLitSubtable *ll_sub) { + assert(ll_sub->hashOffset); + return (const struct RoseLongLitHashEntry *)((const char *)ll_table + + ll_sub->hashOffset); +} + +// Reads from stream state and unpacks values into stream state table. +static really_inline +void loadLongLitStreamState(const struct RoseLongLitTable *ll_table, + const u8 *ll_state, u32 *state_case, + u32 *state_nocase) { + assert(ll_table); + assert(ll_state); + assert(state_case && state_nocase); + + u8 ss_bytes = ll_table->streamStateBytes; + u8 ssb = ll_table->caseful.streamStateBits; + UNUSED u8 ssb_nc = ll_table->nocase.streamStateBits; + assert(ss_bytes == (ssb + ssb_nc + 7) / 8); + +#if defined(ARCH_32_BIT) + // On 32-bit hosts, we may be able to avoid having to do any u64a + // manipulation at all. + if (ss_bytes <= 4) { + u32 ssb_mask = (1U << ssb) - 1; + u32 streamVal = partial_load_u32(ll_state, ss_bytes); + *state_case = (u32)(streamVal & ssb_mask); + *state_nocase = (u32)(streamVal >> ssb); + return; + } +#endif + + u64a ssb_mask = (1ULL << ssb) - 1; + u64a streamVal = partial_load_u64a(ll_state, ss_bytes); + *state_case = (u32)(streamVal & ssb_mask); + *state_nocase = (u32)(streamVal >> ssb); +} + +static rose_inline +void loadLongLiteralStateMode(struct hs_scratch *scratch, + const struct RoseLongLitTable *ll_table, + const struct RoseLongLitSubtable *ll_sub, + const u32 state, const char nocase) { + if (!state) { + DEBUG_PRINTF("no state for %s\n", nocase ? 
"caseless" : "caseful"); + return; + } + + const struct RoseLongLitHashEntry *tab = getHashTableBase(ll_table, ll_sub); + const struct RoseLongLitHashEntry *ent = tab + state - 1; + + assert(ent->str_offset + ent->str_len <= ll_table->size); + const u8 *found_buf = (const u8 *)ll_table + ent->str_offset; + size_t found_sz = ent->str_len; + + struct RoseContext *tctxt = &scratch->tctxt; + if (nocase) { + tctxt->ll_buf_nocase = found_buf; + tctxt->ll_len_nocase = found_sz; + } else { + tctxt->ll_buf = found_buf; + tctxt->ll_len = found_sz; + } +} + +static rose_inline +void loadLongLiteralState(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch) { + if (!t->longLitTableOffset) { + return; + } + + // If we don't have any long literals in play, these values must point to + // the real history buffer so that CHECK_LONG_LIT instructions examine the + // history buffer. + scratch->tctxt.ll_buf = scratch->core_info.hbuf; + scratch->tctxt.ll_len = scratch->core_info.hlen; + scratch->tctxt.ll_buf_nocase = scratch->core_info.hbuf; + scratch->tctxt.ll_len_nocase = scratch->core_info.hlen; + + if (!scratch->core_info.hlen) { + return; + } + + const struct RoseLongLitTable *ll_table = + getByOffset(t, t->longLitTableOffset); + const u8 *ll_state = getLongLitState(t, state); + + u32 state_case; + u32 state_nocase; + loadLongLitStreamState(ll_table, ll_state, &state_case, &state_nocase); + + DEBUG_PRINTF("loaded {%u, %u}\n", state_case, state_nocase); + + loadLongLiteralStateMode(scratch, ll_table, &ll_table->caseful, + state_case, 0); + loadLongLiteralStateMode(scratch, ll_table, &ll_table->nocase, + state_nocase, 1); +} + +static rose_inline +char confirmLongLiteral(const struct RoseLongLitTable *ll_table, + const struct hs_scratch *scratch, + const struct RoseLongLitHashEntry *ent, + const char nocase) { + assert(ent->str_offset + ent->str_len <= ll_table->size); + const u8 *s = (const u8 *)ll_table + ent->str_offset; + size_t len = ent->str_len; + const u8 *buf = scratch->core_info.buf; + const size_t buf_len = scratch->core_info.len; + + if (len > buf_len) { + const struct RoseContext *tctxt = &scratch->tctxt; + const u8 *hist = nocase ? tctxt->ll_buf_nocase : tctxt->ll_buf; + size_t hist_len = nocase ? tctxt->ll_len_nocase : tctxt->ll_len; + + if (len > buf_len + hist_len) { + return 0; // Break out - not enough total history + } + + size_t overhang = len - buf_len; + assert(overhang <= hist_len); + + if (cmpForward(hist + hist_len - overhang, s, overhang, nocase)) { + return 0; + } + s += overhang; + len -= overhang; + } + + // if we got here, we don't need history or we compared ok out of history + assert(len <= buf_len); + + if (cmpForward(buf + buf_len - len, s, len, nocase)) { + return 0; + } + + return 1; +} + +static rose_inline +const u8 *prepScanBuffer(const struct core_info *ci, + const struct RoseLongLitTable *ll_table, u8 *tempbuf) { + const u8 hash_len = ll_table->maxLen; + assert(hash_len >= LONG_LIT_HASH_LEN); + + // Our hash function operates over LONG_LIT_HASH_LEN bytes, starting from + // location (end of buffer - hash_len). If this block can be satisfied + // entirely from either the current buffer or the history buffer, we pass + // in the pointer directly; otherwise we must make a copy. + + const u8 *base; + + if (hash_len > ci->len) { + size_t overhang = hash_len - ci->len; + if (overhang >= LONG_LIT_HASH_LEN) { + // Can read enough to hash from inside the history buffer. 
+ assert(overhang <= ci->hlen); + base = ci->hbuf + ci->hlen - overhang; + } else { + // Copy: first chunk from history buffer. + assert(overhang <= ci->hlen); + copy_upto_64_bytes(tempbuf, ci->hbuf + ci->hlen - overhang, + overhang); + // Copy: second chunk from current buffer. + size_t copy_buf_len = LONG_LIT_HASH_LEN - overhang; + assert(copy_buf_len <= ci->len); + copy_upto_64_bytes(tempbuf + overhang, ci->buf, copy_buf_len); + // Read from our temporary buffer for the hash. + base = tempbuf; + } + } else { + // Can read enough to hash from inside the current buffer. + base = ci->buf + ci->len - hash_len; + } + + return base; +} + +#ifndef NDEBUG +// Defensive checking (used in assert) that these table values don't overflow +// the range available. +static really_inline +char streamingTableOverflow(u32 state_case, u32 state_nocase, u8 ssb, + u8 ssb_nc) { + u32 ssb_mask = (1ULL << (ssb)) - 1; + if (state_case & ~ssb_mask) { + return 1; + } + u32 ssb_nc_mask = (1ULL << (ssb_nc)) - 1; + if (state_nocase & ~ssb_nc_mask) { + return 1; + } + return 0; +} +#endif + +// Reads from stream state table and packs values into stream state. +static rose_inline +void storeLongLitStreamState(const struct RoseLongLitTable *ll_table, + u8 *ll_state, u32 state_case, u32 state_nocase) { + assert(ll_table); + assert(ll_state); + + u8 ss_bytes = ll_table->streamStateBytes; + u8 ssb = ll_table->caseful.streamStateBits; + UNUSED u8 ssb_nc = ll_table->nocase.streamStateBits; + assert(ss_bytes == ROUNDUP_N(ssb + ssb_nc, 8) / 8); + assert(!streamingTableOverflow(state_case, state_nocase, ssb, ssb_nc)); + +#if defined(ARCH_32_BIT) + // On 32-bit hosts, we may be able to avoid having to do any u64a + // manipulation at all. + if (ss_bytes <= 4) { + u32 stagingStreamState = state_case; + stagingStreamState |= (state_nocase << ssb); + partial_store_u32(ll_state, stagingStreamState, ss_bytes); + return; + } +#endif + + u64a stagingStreamState = (u64a)state_case; + stagingStreamState |= (u64a)state_nocase << ssb; + partial_store_u64a(ll_state, stagingStreamState, ss_bytes); +} + +static really_inline +char has_bit(const u8 *data, u32 bit) { + return (data[bit / 8] >> (bit % 8)) & 1; +} + +static rose_inline +char bloomHasKey(const u8 *bloom, u32 bloom_mask, u32 hash) { + return has_bit(bloom, hash & bloom_mask); +} + +static rose_inline +char checkBloomFilter(const struct RoseLongLitTable *ll_table, + const struct RoseLongLitSubtable *ll_sub, + const u8 *scan_buf, char nocase) { + assert(ll_sub->bloomBits); + + const u8 *bloom = (const u8 *)ll_table + ll_sub->bloomOffset; + const u32 bloom_mask = (1U << ll_sub->bloomBits) - 1; + + char v = 1; + v &= bloomHasKey(bloom, bloom_mask, bloomHash_1(scan_buf, nocase)); + v &= bloomHasKey(bloom, bloom_mask, bloomHash_2(scan_buf, nocase)); + v &= bloomHasKey(bloom, bloom_mask, bloomHash_3(scan_buf, nocase)); + return v; +} + +/** + * \brief Look for a hit in the hash table. + * + * Returns zero if not found, otherwise returns (bucket + 1). 
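confirmLongLiteral() has to match a literal whose tail lies in the current scan buffer while its head may still sit in the (possibly synthetic) history buffer, so it compares the overhang against history first and the remainder against the buffer. Below is a standalone, caseful-only sketch of that split comparison with invented names; it is not the patch code and ignores the nocase path.

#include <stdio.h>
#include <string.h>

/* Return 1 if `lit` (length `len`) ends exactly at the end of `buf`,
 * allowing the front of the literal to fall into `hist`. */
static int sketch_confirm(const char *hist, size_t hist_len,
                          const char *buf, size_t buf_len,
                          const char *lit, size_t len)
{
    if (len > buf_len) {
        size_t overhang = len - buf_len;            /* bytes owed by history */
        if (overhang > hist_len) {
            return 0;                               /* not enough total context */
        }
        if (memcmp(hist + hist_len - overhang, lit, overhang) != 0) {
            return 0;
        }
        lit += overhang;                            /* rest must be in buf */
        len -= overhang;
    }
    return memcmp(buf + buf_len - len, lit, len) == 0;
}

int main(void)
{
    /* History ends in "foo", the current block is "barbaz"; the literal
     * "foobarbaz" should confirm at the end of the block. */
    const char *hist = "xxfoo";
    const char *buf = "barbaz";
    printf("%d\n", sketch_confirm(hist, 5, buf, 6, "foobarbaz", 9));  /* 1 */
    printf("%d\n", sketch_confirm(hist, 5, buf, 6, "quxbarbaz", 9));  /* 0 */
    return 0;
}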
+ */ +static rose_inline +u32 checkHashTable(const struct RoseLongLitTable *ll_table, + const struct RoseLongLitSubtable *ll_sub, const u8 *scan_buf, + const struct hs_scratch *scratch, char nocase) { + const u32 nbits = ll_sub->hashBits; + assert(nbits && nbits < 32); + const u32 num_entries = 1U << nbits; + + const struct RoseLongLitHashEntry *tab = getHashTableBase(ll_table, ll_sub); + + u32 hash = hashLongLiteral(scan_buf, LONG_LIT_HASH_LEN, nocase); + u32 bucket = hash & ((1U << nbits) - 1); + + while (tab[bucket].str_offset != 0) { + DEBUG_PRINTF("checking bucket %u\n", bucket); + if (confirmLongLiteral(ll_table, scratch, &tab[bucket], nocase)) { + DEBUG_PRINTF("found hit for bucket %u\n", bucket); + return bucket + 1; + } + + if (++bucket == num_entries) { + bucket = 0; + } + } + + return 0; +} + +static rose_inline +void storeLongLiteralState(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch) { + if (!t->longLitTableOffset) { + DEBUG_PRINTF("no table\n"); + return; + } + + struct core_info *ci = &scratch->core_info; + const struct RoseLongLitTable *ll_table = + getByOffset(t, t->longLitTableOffset); + assert(ll_table->maxLen); + + DEBUG_PRINTF("maxLen=%u, len=%zu, hlen=%zu\n", ll_table->maxLen, ci->len, + ci->hlen); + + u32 state_case = 0; + u32 state_nocase = 0; + + // If we don't have enough history, we don't need to do anything. + if (ll_table->maxLen <= ci->len + ci->hlen) { + u8 tempbuf[LONG_LIT_HASH_LEN]; + const u8 *scan_buf = prepScanBuffer(ci, ll_table, tempbuf); + + if (ll_table->caseful.hashBits && + checkBloomFilter(ll_table, &ll_table->caseful, scan_buf, 0)) { + state_case = checkHashTable(ll_table, &ll_table->caseful, scan_buf, + scratch, 0); + } + + if (ll_table->nocase.hashBits && + checkBloomFilter(ll_table, &ll_table->nocase, scan_buf, 1)) { + state_nocase = checkHashTable(ll_table, &ll_table->nocase, scan_buf, + scratch, 1); + } + } else { + DEBUG_PRINTF("not enough history (%zu bytes)\n", ci->len + ci->hlen); + } + + DEBUG_PRINTF("store {%u, %u}\n", state_case, state_nocase); + + u8 *ll_state = getLongLitState(t, state); + storeLongLitStreamState(ll_table, ll_state, state_case, state_nocase); +} + +#endif // STREAM_LONG_LIT_H diff --git a/regex/rose/stream_long_lit_hash.h b/regex/rose/stream_long_lit_hash.h new file mode 100644 index 000000000..041f05e60 --- /dev/null +++ b/regex/rose/stream_long_lit_hash.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
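checkHashTable() above is an open-addressed probe: hash the scan window, mask down to the bucket count, then walk linearly (wrapping at the end of the table) until an entry confirms or an empty slot (str_offset == 0) is reached. The sketch below reduces that probing loop to a toy table keyed by plain strings; the structures, the stand-in FNV-1a hash and the bucket count are all invented for illustration.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SKETCH_NBITS   3                      /* 8 buckets */
#define SKETCH_BUCKETS (1u << SKETCH_NBITS)

struct sketch_entry {
    const char *str;                          /* NULL marks an empty slot */
};

static uint32_t sketch_hash(const char *s)
{
    uint32_t h = 2166136261u;                 /* FNV-1a, stand-in hash only */
    for (; *s; s++) {
        h = (h ^ (uint8_t)*s) * 16777619u;
    }
    return h;
}

/* Return bucket + 1 on a confirmed hit, 0 otherwise, as checkHashTable does. */
static uint32_t sketch_lookup(const struct sketch_entry *tab, const char *key)
{
    uint32_t bucket = sketch_hash(key) & (SKETCH_BUCKETS - 1);
    while (tab[bucket].str != NULL) {
        if (strcmp(tab[bucket].str, key) == 0) {
            return bucket + 1;
        }
        if (++bucket == SKETCH_BUCKETS) {
            bucket = 0;                       /* wrap, like the probe above */
        }
    }
    return 0;                                 /* empty slot: definite miss */
}

int main(void)
{
    struct sketch_entry tab[SKETCH_BUCKETS] = { { NULL } };
    const char *key = "longliteral";
    tab[sketch_hash(key) & (SKETCH_BUCKETS - 1)].str = key;

    printf("hit state=%u\n", sketch_lookup(tab, key));
    printf("miss state=%u\n", sketch_lookup(tab, "absent"));
    return 0;
}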
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef STREAM_LONG_LIT_HASH_H +#define STREAM_LONG_LIT_HASH_H + +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/unaligned.h" + +/** \brief Length of the buffer operated on by \ref hashLongLiteral(). */ +#define LONG_LIT_HASH_LEN 24 + +/** \brief Multiplier used by al the hash functions below. */ +#define HASH_MULTIPLIER 0x0b4e0ef37bc32127ULL + +/** \brief Hash function used for long literal table in streaming mode. */ +static really_inline +u32 hashLongLiteral(const u8 *ptr, UNUSED size_t len, char nocase) { + // We unconditionally hash LONG_LIT_HASH_LEN bytes; all use cases of this + // hash are for strings longer than this. + assert(len >= 24); + + u64a v1 = unaligned_load_u64a(ptr); + u64a v2 = unaligned_load_u64a(ptr + 8); + u64a v3 = unaligned_load_u64a(ptr + 16); + if (nocase) { + v1 &= OCTO_CASE_CLEAR; + v2 &= OCTO_CASE_CLEAR; + v3 &= OCTO_CASE_CLEAR; + } + v1 *= HASH_MULTIPLIER; + v2 *= HASH_MULTIPLIER * HASH_MULTIPLIER; + v3 *= HASH_MULTIPLIER * HASH_MULTIPLIER * HASH_MULTIPLIER; + v1 >>= 32; + v2 >>= 32; + v3 >>= 32; + return v1 ^ v2 ^ v3; +} + +/** + * \brief Internal, used by the bloom filter hash functions below. Hashes 16 + * bytes beginning at (ptr + offset). + */ +static really_inline +u32 bloomHash_i(const u8 *ptr, u32 offset, u64a multiplier, char nocase) { + assert(offset + 16 <= LONG_LIT_HASH_LEN); + + u64a v = unaligned_load_u64a(ptr + offset); + if (nocase) { + v &= OCTO_CASE_CLEAR; + } + v *= multiplier; + return v >> 32; +} + +/* + * We ensure that we see every byte the first LONG_LIT_HASH_LEN bytes of input + * data (using at least one of the following functions). + */ + +static really_inline +u32 bloomHash_1(const u8 *ptr, char nocase) { + const u64a multiplier = HASH_MULTIPLIER; + return bloomHash_i(ptr, 0, multiplier, nocase); +} + +static really_inline +u32 bloomHash_2(const u8 *ptr, char nocase) { + const u64a multiplier = HASH_MULTIPLIER * HASH_MULTIPLIER; + return bloomHash_i(ptr, 4, multiplier, nocase); +} + +static really_inline +u32 bloomHash_3(const u8 *ptr, char nocase) { + const u64a multiplier = HASH_MULTIPLIER * HASH_MULTIPLIER * HASH_MULTIPLIER; + return bloomHash_i(ptr, 8, multiplier, nocase); +} + +#endif // STREAM_LONG_LIT_HASH_H diff --git a/regex/rose/validate_mask.h b/regex/rose/validate_mask.h new file mode 100644 index 000000000..8191db52f --- /dev/null +++ b/regex/rose/validate_mask.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VALIDATE_MASK_H +#define VALIDATE_MASK_H + +#include "ue2common.h" +#include "util/simd_utils.h" + +#if defined(DEBUG) +static +void validateMask32Print(const u8 *mask) { + int i; + for (i = 0; i < 32; i++) { + printf("%02x", mask[i]); + } + printf("\n"); +} + +#ifdef HAVE_AVX512 +static +void validateMask64Print(const u8 *mask) { + int i; + for (i = 0; i < 64; i++) { + printf("%02x ", mask[i]); + } + printf("\n"); +} +#endif +#endif + +// check positive bytes in cmp_result. +// return one if the check passed, zero otherwise. +static really_inline +int posValidateMask(const u64a cmp_result, const u64a pos_mask) { + return !(cmp_result & pos_mask); +} + +/* + * check negative bytes in cmp_result. + * return one if any byte in cmp_result is not 0, zero otherwise. + * check lowest 7 bits and highest bit of every byte respectively. + */ +static really_inline +int negValidateMask(const u64a cmp_result, const u64a neg_mask) { + const u64a count_mask = 0x7f7f7f7f7f7f7f7f; + // check lowest 7 bits of every byte. + // the highest bit should be 1 if check passed. + u64a check_low = (cmp_result & count_mask) + count_mask; + // check the highest bit of every byte. + // combine the highest bit and 0x7f to 0xff if check passes. + // flip all 0xff to 0x00 and 0x7f to 0x80. + u64a check_all = ~(check_low | cmp_result | count_mask); + return !(check_all & neg_mask); +} + +static really_inline +int validateMask(u64a data, u64a valid_data_mask, u64a and_mask, + u64a cmp_mask, u64a neg_mask) { + // skip some byte where valid_data_mask is 0x00 there. 
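+    // (i.e. byte positions whose valid_data_mask byte is zero take no part
+    // in any of the three masks below.)
+    //
+    // Illustrative example, not taken from the original sources: with an
+    // and_mask byte of 0xff and a cmp_mask byte of 'a' (0x61), a data byte
+    // 'a' produces a zero cmp_result byte while a data byte 'b' produces
+    // 0x03. Positions whose neg_mask byte is zero must yield the zero byte
+    // (checked by posValidateMask() with ~neg_mask); positions whose
+    // neg_mask byte is set must not (checked by negValidateMask(), which
+    // detects zero bytes without a loop via the low-7-bit carry trick).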
+ and_mask &= valid_data_mask; + cmp_mask &= valid_data_mask; + neg_mask &= valid_data_mask; + u64a cmp_result = (data & and_mask) ^ cmp_mask; + /* do the positive check first since it's cheaper */ + if (posValidateMask(cmp_result, ~neg_mask) + && negValidateMask(cmp_result, neg_mask)) { + return 1; + } else { + DEBUG_PRINTF("data %llx valid_data_mask(vdm) %llx\n", + data, valid_data_mask); + DEBUG_PRINTF("and_mask & vdm %llx cmp_mask & vdm %llx\n", and_mask, + cmp_mask); + DEBUG_PRINTF("cmp_result %llx neg_mask & vdm %llx\n", + cmp_result, neg_mask); + return 0; + } +} + +static really_inline +int validateMask32(const m256 data, const u32 valid_data_mask, + const m256 and_mask, const m256 cmp_mask, + const u32 neg_mask) { + m256 cmp_result_256 = eq256(and256(data, and_mask), cmp_mask); + u32 cmp_result = ~movemask256(cmp_result_256); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + validateMask32Print((const u8 *)&data); + DEBUG_PRINTF("cmp_result\n"); + validateMask32Print((const u8 *)&cmp_result_256); +#endif + DEBUG_PRINTF("cmp_result %08x neg_mask %08x\n", cmp_result, neg_mask); + DEBUG_PRINTF("valid_data_mask %08x\n", valid_data_mask); + + if ((cmp_result & valid_data_mask) == (neg_mask & valid_data_mask)) { + DEBUG_PRINTF("checkCompareResult32 passed\n"); + return 1; + } else { + DEBUG_PRINTF("checkCompareResult32 failed\n"); + return 0; + } +} + +#ifdef HAVE_AVX512 +static really_inline +int validateMask64(const m512 data, const u64a valid_data_mask, + const m512 and_mask, const m512 cmp_mask, + const u64a neg_mask) { + u64a cmp_result = ~eq512mask(and512(data, and_mask), cmp_mask); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + validateMask64Print((const u8 *)&data); + DEBUG_PRINTF("cmp_result\n"); + validateMask64Print((const u8 *)&cmp_result); +#endif + DEBUG_PRINTF("cmp_result %016llx neg_mask %016llx\n", cmp_result, neg_mask); + DEBUG_PRINTF("valid_data_mask %016llx\n", valid_data_mask); + + if ((cmp_result & valid_data_mask) == (neg_mask & valid_data_mask)) { + DEBUG_PRINTF("checkCompareResult64 passed\n"); + return 1; + } else { + DEBUG_PRINTF("checkCompareResult64 failed\n"); + return 0; + } +} +#endif + +#endif diff --git a/regex/rose/validate_shufti.h b/regex/rose/validate_shufti.h new file mode 100644 index 000000000..351df36a7 --- /dev/null +++ b/regex/rose/validate_shufti.h @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VALIDATE_SHUFTI_H +#define VALIDATE_SHUFTI_H + +#include "ue2common.h" +#include "util/simd_utils.h" + +#if defined(DEBUG) +static +void dumpMask(const void *mask, int len) { + const u8 *c = (const u8 *)mask; + for (int i = 0; i < len; i++) { + printf("%02x", c[i]); + } + printf("\n"); +} +#endif + +static really_inline +int validateShuftiMask16x16(const m256 data, const m256 hi_mask, + const m256 lo_mask, const m256 and_mask, + const u32 neg_mask, const u32 valid_data_mask) { + m256 low4bits = set32x8(0xf); + m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits)); + m256 c_hi = pshufb_m256(hi_mask, + rshift64_m256(andnot256(low4bits, data), 4)); + m256 t = and256(c_lo, c_hi); + u32 nresult = movemask256(eq256(and256(t, and_mask), zeroes256())); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 32); + DEBUG_PRINTF("hi_mask\n"); + dumpMask(&hi_mask, 32); + DEBUG_PRINTF("lo_mask\n"); + dumpMask(&lo_mask, 32); + DEBUG_PRINTF("c_lo\n"); + dumpMask(&c_lo, 32); + DEBUG_PRINTF("c_hi\n"); + dumpMask(&c_hi, 32); + DEBUG_PRINTF("and_mask\n"); + dumpMask(&and_mask, 32); + DEBUG_PRINTF("nresult %x\n", nresult); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); +#endif + u32 cmp_result = (((nresult >> 16) & nresult) ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask16x8(const m128 data, const m256 nib_mask, + const m128 and_mask, const u32 neg_mask, + const u32 valid_data_mask) { + m256 data_m256 = combine2x128(rshift64_m128(data, 4), data); + m256 low4bits = set32x8(0xf); + m256 c_nib = pshufb_m256(nib_mask, and256(data_m256, low4bits)); + m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib)); + m128 nresult = eq128(and128(t, and_mask), zeroes128()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data_m256, 32); + DEBUG_PRINTF("nib_mask\n"); + dumpMask(&nib_mask, 32); + DEBUG_PRINTF("c_nib\n"); + dumpMask(&c_nib, 32); + DEBUG_PRINTF("nresult\n"); + dumpMask(&nresult, 16); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); +#endif + u32 cmp_result = (movemask128(nresult) ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask32x8(const m256 data, const m256 hi_mask, + const m256 lo_mask, const m256 and_mask, + const u32 neg_mask, const u32 valid_data_mask) { + m256 low4bits = set32x8(0xf); + m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits)); + m256 c_hi = pshufb_m256(hi_mask, + rshift64_m256(andnot256(low4bits, data), 4)); + m256 t = and256(c_lo, c_hi); + m256 nresult = eq256(and256(t, and_mask), zeroes256()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 32); + DEBUG_PRINTF("hi_mask\n"); + dumpMask(&hi_mask, 32); + DEBUG_PRINTF("lo_mask\n"); + dumpMask(&lo_mask, 32); + DEBUG_PRINTF("c_lo\n"); + dumpMask(&c_lo, 32); + DEBUG_PRINTF("c_hi\n"); + dumpMask(&c_hi, 32); + DEBUG_PRINTF("nresult\n"); + dumpMask(&nresult, 32); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); +#endif + u32 cmp_result = 
(movemask256(nresult) ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask32x16(const m256 data, + const m256 hi_mask_1, const m256 hi_mask_2, + const m256 lo_mask_1, const m256 lo_mask_2, + const m256 bucket_mask_hi, + const m256 bucket_mask_lo, const u32 neg_mask, + const u32 valid_data_mask) { + m256 low4bits = set32x8(0xf); + m256 data_lo = and256(data, low4bits); + m256 data_hi = and256(rshift64_m256(data, 4), low4bits); + m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo); + m256 c_lo_2 = pshufb_m256(lo_mask_2, data_lo); + m256 c_hi_1 = pshufb_m256(hi_mask_1, data_hi); + m256 c_hi_2 = pshufb_m256(hi_mask_2, data_hi); + m256 t1 = and256(c_lo_1, c_hi_1); + m256 t2 = and256(c_lo_2, c_hi_2); + m256 result = or256(and256(t1, bucket_mask_lo), and256(t2, bucket_mask_hi)); + u32 nresult = movemask256(eq256(result, zeroes256())); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 32); + DEBUG_PRINTF("data_lo\n"); + dumpMask(&data_lo, 32); + DEBUG_PRINTF("data_hi\n"); + dumpMask(&data_hi, 32); + DEBUG_PRINTF("hi_mask_1\n"); + dumpMask(&hi_mask_1, 16); + DEBUG_PRINTF("hi_mask_2\n"); + dumpMask(&hi_mask_2, 16); + DEBUG_PRINTF("lo_mask_1\n"); + dumpMask(&lo_mask_1, 16); + DEBUG_PRINTF("lo_mask_2\n"); + dumpMask(&lo_mask_2, 16); + DEBUG_PRINTF("c_lo_1\n"); + dumpMask(&c_lo_1, 32); + DEBUG_PRINTF("c_lo_2\n"); + dumpMask(&c_lo_2, 32); + DEBUG_PRINTF("c_hi_1\n"); + dumpMask(&c_hi_1, 32); + DEBUG_PRINTF("c_hi_2\n"); + dumpMask(&c_hi_2, 32); + DEBUG_PRINTF("result\n"); + dumpMask(&result, 32); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); +#endif + u32 cmp_result = (nresult ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +#ifdef HAVE_AVX512 +static really_inline +int validateShuftiMask64x8(const m512 data, const m512 hi_mask, + const m512 lo_mask, const m512 and_mask, + const u64a neg_mask, const u64a valid_data_mask) { + m512 low4bits = set64x8(0xf); + m512 c_lo = pshufb_m512(lo_mask, and512(data, low4bits)); + m512 c_hi = pshufb_m512(hi_mask, + rshift64_m512(andnot512(low4bits, data), 4)); + m512 t = and512(c_lo, c_hi); + u64a nresult = eq512mask(and512(t, and_mask), zeroes512()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 64); + DEBUG_PRINTF("hi_mask\n"); + dumpMask(&hi_mask, 64); + DEBUG_PRINTF("lo_mask\n"); + dumpMask(&lo_mask, 64); + DEBUG_PRINTF("c_lo\n"); + dumpMask(&c_lo, 64); + DEBUG_PRINTF("c_hi\n"); + dumpMask(&c_hi, 64); + DEBUG_PRINTF("nresult %llx\n", nresult); + DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask); +#endif + u64a cmp_result = (nresult ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask64x16(const m512 data, + const m512 hi_mask_1, const m512 hi_mask_2, + const m512 lo_mask_1, const m512 lo_mask_2, + const m512 and_mask_hi, const m512 and_mask_lo, + const u64a neg_mask, const u64a valid_data_mask) { + m512 low4bits = set64x8(0xf); + m512 data_lo = and512(data, low4bits); + m512 data_hi = and512(rshift64_m512(data, 4), low4bits); + m512 c_lo_1 = pshufb_m512(lo_mask_1, data_lo); + m512 c_lo_2 = pshufb_m512(lo_mask_2, data_lo); + m512 c_hi_1 = pshufb_m512(hi_mask_1, data_hi); + m512 c_hi_2 = pshufb_m512(hi_mask_2, data_hi); + m512 t1 = and512(c_lo_1, c_hi_1); + m512 t2 = and512(c_lo_2, c_hi_2); + m512 result = or512(and512(t1, and_mask_lo), and512(t2, and_mask_hi)); + u64a nresult = eq512mask(result, zeroes512()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 64); + DEBUG_PRINTF("data_lo\n"); + dumpMask(&data_lo, 64); + 
DEBUG_PRINTF("data_hi\n"); + dumpMask(&data_hi, 64); + DEBUG_PRINTF("hi_mask_1\n"); + dumpMask(&hi_mask_1, 64); + DEBUG_PRINTF("hi_mask_2\n"); + dumpMask(&hi_mask_2, 64); + DEBUG_PRINTF("lo_mask_1\n"); + dumpMask(&lo_mask_1, 64); + DEBUG_PRINTF("lo_mask_2\n"); + dumpMask(&lo_mask_2, 64); + DEBUG_PRINTF("c_lo_1\n"); + dumpMask(&c_lo_1, 64); + DEBUG_PRINTF("c_lo_2\n"); + dumpMask(&c_lo_2, 64); + DEBUG_PRINTF("c_hi_1\n"); + dumpMask(&c_hi_1, 64); + DEBUG_PRINTF("c_hi_2\n"); + dumpMask(&c_hi_2, 64); + DEBUG_PRINTF("result\n"); + dumpMask(&result, 64); + DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask); +#endif + u64a cmp_result = (nresult ^ neg_mask) & valid_data_mask; + return !cmp_result; +} +#endif + +static really_inline +int checkMultipath32(u32 data, u32 hi_bits, u32 lo_bits) { + u32 t = ~(data | hi_bits); + t += lo_bits; + t &= (~data) & hi_bits; + DEBUG_PRINTF("t %x\n", t); + return !!t; +} + +static really_inline +int checkMultipath64(u64a data, u64a hi_bits, u64a lo_bits) { + u64a t = ~(data | hi_bits); + t += lo_bits; + t &= (~data) & hi_bits; + DEBUG_PRINTF("t %llx\n", t); + return !!t; +} + +static really_inline +int validateMultipathShuftiMask16x8(const m128 data, + const m256 nib_mask, + const m128 bucket_select_mask, + const u32 hi_bits, const u32 lo_bits, + const u32 neg_mask, + const u32 valid_path_mask) { + m256 data_256 = combine2x128(rshift64_m128(data, 4), data); + m256 low4bits = set32x8(0xf); + m256 c_nib = pshufb_m256(nib_mask, and256(data_256, low4bits)); + m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib)); + m128 result = and128(t, bucket_select_mask); + u32 nresult = movemask128(eq128(result, zeroes128())); + u32 cmp_result = (nresult ^ neg_mask) | valid_path_mask; + + DEBUG_PRINTF("cmp_result %x\n", cmp_result); + + return checkMultipath32(cmp_result, hi_bits, lo_bits); +} + +static really_inline +int validateMultipathShuftiMask32x8(const m256 data, + const m256 hi_mask, const m256 lo_mask, + const m256 bucket_select_mask, + const u32 hi_bits, const u32 lo_bits, + const u32 neg_mask, + const u32 valid_path_mask) { + m256 low4bits = set32x8(0xf); + m256 data_lo = and256(data, low4bits); + m256 data_hi = and256(rshift64_m256(data, 4), low4bits); + m256 c_lo = pshufb_m256(lo_mask, data_lo); + m256 c_hi = pshufb_m256(hi_mask, data_hi); + m256 c = and256(c_lo, c_hi); + m256 result = and256(c, bucket_select_mask); + u32 nresult = movemask256(eq256(result, zeroes256())); + u32 cmp_result = (nresult ^ neg_mask) | valid_path_mask; + + DEBUG_PRINTF("cmp_result %x\n", cmp_result); + + return checkMultipath32(cmp_result, hi_bits, lo_bits); +} + +static really_inline +int validateMultipathShuftiMask32x16(const m256 data, + const m256 hi_mask_1, const m256 hi_mask_2, + const m256 lo_mask_1, const m256 lo_mask_2, + const m256 bucket_select_mask_hi, + const m256 bucket_select_mask_lo, + const u32 hi_bits, const u32 lo_bits, + const u32 neg_mask, + const u32 valid_path_mask) { + m256 low4bits = set32x8(0xf); + m256 data_lo = and256(data, low4bits); + m256 data_hi = and256(rshift64_m256(data, 4), low4bits); + m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo); + m256 c_lo_2 = pshufb_m256(lo_mask_2, data_lo); + m256 c_hi_1 = pshufb_m256(hi_mask_1, data_hi); + m256 c_hi_2 = pshufb_m256(hi_mask_2, data_hi); + m256 t1 = and256(c_lo_1, c_hi_1); + m256 t2 = and256(c_lo_2, c_hi_2); + m256 result = or256(and256(t1, bucket_select_mask_lo), + and256(t2, bucket_select_mask_hi)); + u32 nresult = movemask256(eq256(result, zeroes256())); + u32 cmp_result = (nresult ^ neg_mask) | 
valid_path_mask; + + DEBUG_PRINTF("cmp_result %x\n", cmp_result); + + return checkMultipath32(cmp_result, hi_bits, lo_bits); +} + +static really_inline +int validateMultipathShuftiMask64(const m256 data_1, const m256 data_2, + const m256 hi_mask, const m256 lo_mask, + const m256 bucket_select_mask_1, + const m256 bucket_select_mask_2, + const u64a hi_bits, const u64a lo_bits, + const u64a neg_mask, + const u64a valid_path_mask) { + m256 low4bits = set32x8(0xf); + m256 c_lo_1 = pshufb_m256(lo_mask, and256(data_1, low4bits)); + m256 c_lo_2 = pshufb_m256(lo_mask, and256(data_2, low4bits)); + m256 c_hi_1 = pshufb_m256(hi_mask, + rshift64_m256(andnot256(low4bits, data_1), 4)); + m256 c_hi_2 = pshufb_m256(hi_mask, + rshift64_m256(andnot256(low4bits, data_2), 4)); + m256 t1 = and256(c_lo_1, c_hi_1); + m256 t2 = and256(c_lo_2, c_hi_2); + m256 nresult_1 = eq256(and256(t1, bucket_select_mask_1), zeroes256()); + m256 nresult_2 = eq256(and256(t2, bucket_select_mask_2), zeroes256()); + u64a nresult = (u64a)movemask256(nresult_1) | + (u64a)movemask256(nresult_2) << 32; + u64a cmp_result = (nresult ^ neg_mask) | valid_path_mask; + + DEBUG_PRINTF("cmp_result %llx\n", cmp_result); + + return checkMultipath64(cmp_result, hi_bits, lo_bits); +} + +#endif diff --git a/regex/runtime.c b/regex/runtime.c new file mode 100644 index 000000000..b7c17320e --- /dev/null +++ b/regex/runtime.c @@ -0,0 +1,1356 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime functions. 
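+ *
+ * This file implements the public scan entry points (hs_scan(),
+ * hs_scan_stream(), hs_scan_vector() and the TfwStr wrapper
+ * hs_scan_tfwstr()) together with the stream lifecycle and stream
+ * compress/expand helpers.
+ *
+ * Minimal block-mode usage sketch (error handling omitted; on_match is the
+ * caller's match_event_handler and the teardown assumes the usual
+ * hs_free_scratch() from scratch.c):
+ *
+ *     hs_scratch_t *scratch = NULL;
+ *     hs_alloc_scratch(db, &scratch);
+ *     hs_scan(db, buf, len, 0, scratch, on_match, ctx);
+ *     hs_free_scratch(scratch);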
+ */ + +#ifndef __KERNEL__ +#include +#include +#else +#include +#include +#endif + +#include "allocator.h" +#include "hs_compile.h" /* for HS_MODE_* flags */ +#include "hs_runtime.h" +#include "hs_internal.h" +#include "hwlm/hwlm.h" +#include "nfa/mcclellan.h" +#include "nfa/nfa_api.h" +#include "nfa/nfa_api_util.h" +#include "nfa/nfa_internal.h" +#include "nfa/nfa_rev_api.h" +#include "nfa/sheng.h" +#include "smallwrite/smallwrite_internal.h" +#include "rose/rose.h" +#include "rose/runtime.h" +#include "database.h" +#include "report.h" +#include "scratch.h" +#include "som/som_runtime.h" +#include "som/som_stream.h" +#include "state.h" +#include "stream_compress.h" +#include "ue2common.h" +#include "util/exhaust.h" +#include "util/multibit.h" +#include "fw/str.h" + +static really_inline +void prefetch_data(const char *data, unsigned length) { + __builtin_prefetch(data); + __builtin_prefetch(data + length/2); + __builtin_prefetch(data + length - 24); +} + +/** dummy event handler for use when user does not provide one */ +static +int HS_CDECL null_onEvent(UNUSED unsigned id, UNUSED unsigned long long from, + UNUSED unsigned long long to, UNUSED unsigned flags, + UNUSED void *ctxt) { + return 0; +} + +static really_inline +u32 getHistoryAmount(const struct RoseEngine *t, u64a offset) { + return MIN(t->historyRequired, offset); +} + +static really_inline +u8 *getHistory(char *state, const struct RoseEngine *t, u64a offset) { + return (u8 *)state + t->stateOffsets.history + t->historyRequired + - MIN(t->historyRequired, offset); +} + +/** \brief Sanity checks for scratch space. + * + * Although more at home in scratch.c, it is located here to be closer to its + * callers. + */ +static really_inline +char validScratch(const struct RoseEngine *t, const struct hs_scratch *s) { + if (!ISALIGNED_CL(s)) { + DEBUG_PRINTF("bad alignment %p\n", s); + return 0; + } + + if (s->magic != SCRATCH_MAGIC) { + DEBUG_PRINTF("bad magic 0x%x\n", s->magic); + return 0; + } + + if (t->mode == HS_MODE_BLOCK && t->stateOffsets.end > s->bStateSize) { + DEBUG_PRINTF("bad state size\n"); + return 0; + } + + if (t->queueCount > s->queueCount) { + DEBUG_PRINTF("bad queue count\n"); + return 0; + } + + /* TODO: add quick rose sanity checks */ + + return 1; +} + +static really_inline +void populateCoreInfo(struct hs_scratch *s, const struct RoseEngine *rose, + char *state, match_event_handler onEvent, void *userCtx, + const char *data, size_t length, const u8 *history, + size_t hlen, u64a offset, u8 status, + UNUSED unsigned int flags) { + assert(rose); + s->core_info.userContext = userCtx; + s->core_info.userCallback = onEvent ? onEvent : null_onEvent; + s->core_info.rose = rose; + s->core_info.state = state; /* required for chained queues + evec */ + + s->core_info.exhaustionVector = state + rose->stateOffsets.exhausted; + s->core_info.status = status; + s->core_info.buf = (const u8 *)data; + s->core_info.len = length; + s->core_info.hbuf = history; + s->core_info.hlen = hlen; + s->core_info.buf_offset = offset; + + /* and some stuff not actually in core info */ + s->som_set_now_offset = ~0ULL; + s->deduper.current_report_offset = ~0ULL; + s->deduper.som_log_dirty = 1; /* som logs have not been cleared */ + s->fdr_conf = NULL; + + // Rose program execution (used for some report paths) depends on these + // values being initialised. 
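+    // (These are absolute stream offsets, which is why minMatchOffset and
+    // minNonMpvMatchOffset are seeded with the buffer offset rather than
+    // with zero.)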
+ s->tctxt.lastMatchOffset = 0; + s->tctxt.minMatchOffset = offset; + s->tctxt.minNonMpvMatchOffset = offset; +} + +#define STATUS_VALID_BITS \ + (STATUS_TERMINATED | STATUS_EXHAUSTED | STATUS_DELAY_DIRTY | STATUS_ERROR) + +/** \brief Retrieve status bitmask from stream state. */ +static really_inline +u8 getStreamStatus(const char *state) { + u8 status = *(const u8 *)(state + ROSE_STATE_OFFSET_STATUS_FLAGS); + assert((status & ~STATUS_VALID_BITS) == 0); + return status; +} + +/** \brief Store status bitmask to stream state. */ +static really_inline +void setStreamStatus(char *state, u8 status) { + assert((status & ~STATUS_VALID_BITS) == 0); + *(u8 *)(state + ROSE_STATE_OFFSET_STATUS_FLAGS) = status; +} + +/** \brief Initialise SOM state. Used in both block and streaming mode. */ +static really_inline +void initSomState(const struct RoseEngine *rose, char *state) { + assert(rose && state); + const u32 somCount = rose->somLocationCount; + mmbit_clear((u8 *)state + rose->stateOffsets.somValid, somCount); + mmbit_clear((u8 *)state + rose->stateOffsets.somWritable, somCount); +} + +static really_inline +void rawBlockExec(const struct RoseEngine *rose, struct hs_scratch *scratch) { + assert(rose); + assert(scratch); + + initSomState(rose, scratch->core_info.state); + + DEBUG_PRINTF("blockmode scan len=%zu\n", scratch->core_info.len); + + roseBlockExec(rose, scratch); +} + +static really_inline +void pureLiteralInitScratch(struct hs_scratch *scratch, u64a offset) { + // Some init has already been done. + assert(offset == scratch->core_info.buf_offset); + + scratch->tctxt.lit_offset_adjust = offset + 1; + scratch->tctxt.lastEndOffset = offset; + scratch->tctxt.delayLastEndOffset = offset; + scratch->tctxt.filledDelayedSlots = 0; + scratch->al_log_sum = 0; +} + + +static really_inline +void pureLiteralBlockExec(const struct RoseEngine *rose, + struct hs_scratch *scratch) { + assert(rose); + assert(scratch); + + const struct HWLM *ftable = getFLiteralMatcher(rose); + + initSomState(rose, scratch->core_info.state); + const u8 *buffer = scratch->core_info.buf; + size_t length = scratch->core_info.len; + DEBUG_PRINTF("rose engine %d\n", rose->runtimeImpl); + + pureLiteralInitScratch(scratch, 0); + scratch->tctxt.groups = rose->initialGroups; + + hwlmExec(ftable, buffer, length, 0, roseCallback, scratch, + rose->initialGroups & rose->floating_group_mask); +} + +static really_inline +void initOutfixQueue(struct mq *q, u32 qi, const struct RoseEngine *t, + struct hs_scratch *scratch) { + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + q->nfa = getNfaByInfo(t, info); + q->end = 0; + q->cur = 0; + q->state = scratch->fullState + info->fullStateOffset; + q->streamState = (char *)scratch->core_info.state + info->stateOffset; + q->offset = scratch->core_info.buf_offset; + q->buffer = scratch->core_info.buf; + q->length = scratch->core_info.len; + q->history = scratch->core_info.hbuf; + q->hlength = scratch->core_info.hlen; + q->cb = roseReportAdaptor; + q->context = scratch; + q->report_current = 0; + + DEBUG_PRINTF("qi=%u, offset=%llu, fullState=%u, streamState=%u, " + "state=%u\n", qi, q->offset, info->fullStateOffset, + info->stateOffset, *(u32 *)q->state); +} + +static never_inline +void soleOutfixBlockExec(const struct RoseEngine *t, + struct hs_scratch *scratch) { + assert(t); + assert(scratch); + + initSomState(t, scratch->core_info.state); + assert(t->outfixEndQueue == 1); + assert(!t->amatcherOffset); + assert(!t->ematcherOffset); + assert(!t->fmatcherOffset); + + const struct NFA *nfa = 
getNfaByQueue(t, 0); + + size_t len = nfaRevAccelCheck(nfa, scratch->core_info.buf, + scratch->core_info.len); + if (!len) { + return; + } + + struct mq *q = scratch->queues; + initOutfixQueue(q, 0, t, scratch); + q->length = len; /* adjust for rev_accel */ + nfaQueueInitState(nfa, q); + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + pushQueueAt(q, 2, MQE_END, scratch->core_info.len); + + char rv = nfaQueueExec(q->nfa, q, scratch->core_info.len); + + if (rv && nfaAcceptsEod(nfa) && len == scratch->core_info.len) { + nfaCheckFinalState(nfa, q->state, q->streamState, q->length, q->cb, + scratch); + } +} + +static rose_inline +void runSmallWriteEngine(const struct SmallWriteEngine *smwr, + struct hs_scratch *scratch) { + assert(smwr); + assert(scratch); + + const u8 *buffer = scratch->core_info.buf; + size_t length = scratch->core_info.len; + + DEBUG_PRINTF("USING SMALL WRITE\n"); + + if (length <= smwr->start_offset) { + DEBUG_PRINTF("too short\n"); + return; + } + + const struct NFA *nfa = getSmwrNfa(smwr); + + size_t local_alen = length - smwr->start_offset; + const u8 *local_buffer = buffer + smwr->start_offset; + + assert(isDfaType(nfa->type)); + if (nfa->type == MCCLELLAN_NFA_8) { + nfaExecMcClellan8_B(nfa, smwr->start_offset, local_buffer, + local_alen, roseReportAdaptor, scratch); + } else if (nfa->type == MCCLELLAN_NFA_16) { + nfaExecMcClellan16_B(nfa, smwr->start_offset, local_buffer, + local_alen, roseReportAdaptor, scratch); + } else { + nfaExecSheng_B(nfa, smwr->start_offset, local_buffer, + local_alen, roseReportAdaptor, scratch); + } +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_scan(const hs_database_t *db, const char *data, + unsigned length, unsigned flags, + hs_scratch_t *scratch, match_event_handler onEvent, + void *userCtx) { + if (unlikely(!scratch || !data)) { + return HS_INVALID; + } + + hs_error_t err = validDatabase(db); + if (unlikely(err != HS_SUCCESS)) { + return err; + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + if (unlikely(!ISALIGNED_16(rose))) { + return HS_INVALID; + } + + if (unlikely(rose->mode != HS_MODE_BLOCK)) { + return HS_DB_MODE_ERROR; + } + + if (unlikely(!validScratch(rose, scratch))) { + return HS_INVALID; + } + + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + + if (rose->minWidth > length) { + DEBUG_PRINTF("minwidth=%u > length=%u\n", rose->minWidth, length); + unmarkScratchInUse(scratch); + return HS_SUCCESS; + } + + prefetch_data(data, length); + + /* populate core info in scratch */ + populateCoreInfo(scratch, rose, scratch->bstate, onEvent, userCtx, data, + length, NULL, 0, 0, 0, flags); + + clearEvec(rose, scratch->core_info.exhaustionVector); + if (rose->ckeyCount) { + scratch->core_info.logicalVector = scratch->bstate + + rose->stateOffsets.logicalVec; + scratch->core_info.combVector = scratch->bstate + + rose->stateOffsets.combVec; + scratch->tctxt.lastCombMatchOffset = 0; + clearLvec(rose, scratch->core_info.logicalVector, + scratch->core_info.combVector); + } + + if (!length) { + if (rose->boundary.reportZeroEodOffset) { + roseRunBoundaryProgram(rose, rose->boundary.reportZeroEodOffset, 0, + scratch); + } + goto set_retval; + } + + if (rose->boundary.reportZeroOffset) { + int rv = roseRunBoundaryProgram(rose, rose->boundary.reportZeroOffset, + 0, scratch); + if (rv == MO_HALT_MATCHING) { + goto set_retval; + } + } + + if (rose->minWidthExcludingBoundaries > length) { + DEBUG_PRINTF("minWidthExcludingBoundaries=%u > length=%u\n", + rose->minWidthExcludingBoundaries, 
length); + goto done_scan; + } + + // Similarly, we may have a maximum width (for engines constructed entirely + // of bi-anchored patterns). + if (rose->maxBiAnchoredWidth != ROSE_BOUND_INF + && length > rose->maxBiAnchoredWidth) { + DEBUG_PRINTF("block len=%u longer than maxBAWidth=%u\n", length, + rose->maxBiAnchoredWidth); + goto done_scan; + } + + // Is this a small write case? + if (rose->smallWriteOffset) { + const struct SmallWriteEngine *smwr = getSmallWrite(rose); + assert(smwr); + + // Apply the small write engine if and only if the block (buffer) is + // small enough. Otherwise, we allow rose &co to deal with it. + if (length < smwr->largestBuffer) { + DEBUG_PRINTF("Attempting small write of block %u bytes long.\n", + length); + runSmallWriteEngine(smwr, scratch); + goto done_scan; + } + } + + switch (rose->runtimeImpl) { + case ROSE_RUNTIME_FULL_ROSE: + rawBlockExec(rose, scratch); + break; + case ROSE_RUNTIME_PURE_LITERAL: + pureLiteralBlockExec(rose, scratch); + break; + case ROSE_RUNTIME_SINGLE_OUTFIX: + soleOutfixBlockExec(rose, scratch); + break; + default: + assert(0); + } + +done_scan: + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } else if (told_to_stop_matching(scratch)) { + unmarkScratchInUse(scratch); + return HS_SCAN_TERMINATED; + } + + if (rose->hasSom) { + int halt = flushStoredSomMatches(scratch, ~0ULL); + if (halt) { + unmarkScratchInUse(scratch); + return HS_SCAN_TERMINATED; + } + } + + if (rose->boundary.reportEodOffset) { + roseRunBoundaryProgram(rose, rose->boundary.reportEodOffset, length, + scratch); + } + +set_retval: + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } + + if (rose->lastFlushCombProgramOffset) { + if (roseRunLastFlushCombProgram(rose, scratch, length) + == MO_HALT_MATCHING) { + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } + unmarkScratchInUse(scratch); + return HS_SCAN_TERMINATED; + } + } + + DEBUG_PRINTF("done. told_to_stop_matching=%d\n", + told_to_stop_matching(scratch)); + hs_error_t rv = told_to_stop_matching(scratch) ? HS_SCAN_TERMINATED + : HS_SUCCESS; + unmarkScratchInUse(scratch); + return rv; +} + +static really_inline +void maintainHistoryBuffer(const struct RoseEngine *rose, char *state, + const char *buffer, size_t length) { + if (!rose->historyRequired) { + return; + } + + // Hopefully few of our users are scanning no data. + if (unlikely(length == 0)) { + DEBUG_PRINTF("zero-byte scan\n"); + return; + } + + char *his_state = state + rose->stateOffsets.history; + + if (length < rose->historyRequired) { + size_t shortfall = rose->historyRequired - length; + memmove(his_state, his_state + rose->historyRequired - shortfall, + shortfall); + } + size_t amount = MIN(rose->historyRequired, length); + + memcpy(his_state + rose->historyRequired - amount, buffer + length - amount, + amount); +#ifdef DEBUG_HISTORY + printf("History [%u] : ", rose->historyRequired); + for (size_t i = 0; i < rose->historyRequired; i++) { + printf(" %02hhx", his_state[i]); + } + printf("\n"); +#endif +} + +static really_inline +void init_stream(struct hs_stream *s, const struct RoseEngine *rose, + char init_history) { + char *state = getMultiState(s); + + if (init_history) { + // Make absolutely sure that the 16 bytes leading up to the end of the + // history buffer are initialised, as we rely on this (regardless of the + // actual values used) in FDR. 
+ char *hist_end = + state + rose->stateOffsets.history + rose->historyRequired; + assert(hist_end - 16 >= (const char *)s); + memset(hist_end - 16, 0x5a, 16); + } + + s->rose = rose; + s->offset = 0; + + setStreamStatus(state, 0); + roseInitState(rose, state); + + clearEvec(rose, state + rose->stateOffsets.exhausted); + if (rose->ckeyCount) { + clearLvec(rose, state + rose->stateOffsets.logicalVec, + state + rose->stateOffsets.combVec); + } + + // SOM state multibit structures. + initSomState(rose, state); +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_open_stream(const hs_database_t *db, + UNUSED unsigned flags, + hs_stream_t **stream) { + if (unlikely(!stream)) { + return HS_INVALID; + } + + *stream = NULL; + + hs_error_t err = validDatabase(db); + if (unlikely(err != HS_SUCCESS)) { + return err; + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + if (unlikely(!ISALIGNED_16(rose))) { + return HS_INVALID; + } + + if (unlikely(rose->mode != HS_MODE_STREAM)) { + return HS_DB_MODE_ERROR; + } + + size_t stateSize = rose->stateOffsets.end; + struct hs_stream *s = hs_stream_alloc(sizeof(struct hs_stream) + stateSize); + if (unlikely(!s)) { + return HS_NOMEM; + } + + init_stream(s, rose, 1); + + *stream = s; + return HS_SUCCESS; +} + + +static really_inline +void rawEodExec(hs_stream_t *id, hs_scratch_t *scratch) { + const struct RoseEngine *rose = id->rose; + + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("stream already broken\n"); + return; + } + + if (isAllExhausted(rose, scratch->core_info.exhaustionVector)) { + DEBUG_PRINTF("stream exhausted\n"); + return; + } + + roseStreamEodExec(rose, id->offset, scratch); +} + +static never_inline +void soleOutfixEodExec(hs_stream_t *id, hs_scratch_t *scratch) { + const struct RoseEngine *t = id->rose; + + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("stream already broken\n"); + return; + } + + if (isAllExhausted(t, scratch->core_info.exhaustionVector)) { + DEBUG_PRINTF("stream exhausted\n"); + return; + } + + assert(t->outfixEndQueue == 1); + assert(!t->amatcherOffset); + assert(!t->ematcherOffset); + assert(!t->fmatcherOffset); + + const struct NFA *nfa = getNfaByQueue(t, 0); + + struct mq *q = scratch->queues; + initOutfixQueue(q, 0, t, scratch); + if (!scratch->core_info.buf_offset) { + DEBUG_PRINTF("buf_offset is zero\n"); + return; /* no vacuous engines */ + } + + nfaExpandState(nfa, q->state, q->streamState, q->offset, + queue_prev_byte(q, 0)); + + assert(nfaAcceptsEod(nfa)); + nfaCheckFinalState(nfa, q->state, q->streamState, q->offset, q->cb, + scratch); +} + +static really_inline +void report_eod_matches(hs_stream_t *id, hs_scratch_t *scratch, + match_event_handler onEvent, void *context) { + DEBUG_PRINTF("--- report eod matches at offset %llu\n", id->offset); + assert(onEvent); + + const struct RoseEngine *rose = id->rose; + char *state = getMultiState(id); + u8 status = getStreamStatus(state); + + if (status & (STATUS_TERMINATED | STATUS_EXHAUSTED | STATUS_ERROR)) { + DEBUG_PRINTF("stream is broken, just freeing storage\n"); + return; + } + + populateCoreInfo(scratch, rose, state, onEvent, context, NULL, 0, + getHistory(state, rose, id->offset), + getHistoryAmount(rose, id->offset), id->offset, status, 0); + + if (rose->ckeyCount) { + scratch->core_info.logicalVector = state + + rose->stateOffsets.logicalVec; + scratch->core_info.combVector = state + rose->stateOffsets.combVec; + if (!id->offset) { + scratch->tctxt.lastCombMatchOffset = id->offset; + } + } + + if (rose->somLocationCount) { + loadSomFromStream(scratch, 
id->offset); + } + + if (!id->offset) { + if (rose->boundary.reportZeroEodOffset) { + int rv = roseRunBoundaryProgram( + rose, rose->boundary.reportZeroEodOffset, 0, scratch); + if (rv == MO_HALT_MATCHING) { + return; + } + } + } else { + if (rose->boundary.reportEodOffset) { + int rv = roseRunBoundaryProgram( + rose, rose->boundary.reportEodOffset, id->offset, scratch); + if (rv == MO_HALT_MATCHING) { + return; + } + } + + if (rose->requiresEodCheck) { + switch (rose->runtimeImpl) { + default: + case ROSE_RUNTIME_FULL_ROSE: + rawEodExec(id, scratch); + break; + case ROSE_RUNTIME_SINGLE_OUTFIX: + soleOutfixEodExec(id, scratch); + break; + case ROSE_RUNTIME_PURE_LITERAL: + assert(0); + } + } + } + + if (rose->hasSom && !told_to_stop_matching(scratch)) { + int halt = flushStoredSomMatches(scratch, ~0ULL); + if (halt) { + DEBUG_PRINTF("told to stop matching\n"); + scratch->core_info.status |= STATUS_TERMINATED; + } + } + + if (rose->lastFlushCombProgramOffset && !told_to_stop_matching(scratch)) { + if (roseRunLastFlushCombProgram(rose, scratch, id->offset) + == MO_HALT_MATCHING) { + DEBUG_PRINTF("told to stop matching\n"); + scratch->core_info.status |= STATUS_TERMINATED; + } + } +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_copy_stream(hs_stream_t **to_id, + const hs_stream_t *from_id) { + if (!to_id) { + return HS_INVALID; + } + + *to_id = NULL; + + if (!from_id || !from_id->rose) { + return HS_INVALID; + } + + const struct RoseEngine *rose = from_id->rose; + size_t stateSize = sizeof(struct hs_stream) + rose->stateOffsets.end; + + struct hs_stream *s = hs_stream_alloc(stateSize); + if (!s) { + return HS_NOMEM; + } + + memcpy(s, from_id, stateSize); + + *to_id = s; + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_reset_and_copy_stream(hs_stream_t *to_id, + const hs_stream_t *from_id, + hs_scratch_t *scratch, + match_event_handler onEvent, + void *context) { + if (!from_id || !from_id->rose) { + return HS_INVALID; + } + + if (!to_id || to_id->rose != from_id->rose) { + return HS_INVALID; + } + + if (to_id == from_id) { + return HS_INVALID; + } + + if (onEvent) { + if (!scratch || !validScratch(to_id->rose, scratch)) { + return HS_INVALID; + } + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + report_eod_matches(to_id, scratch, onEvent, context); + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } + unmarkScratchInUse(scratch); + } + + size_t stateSize + = sizeof(struct hs_stream) + from_id->rose->stateOffsets.end; + + memcpy(to_id, from_id, stateSize); + + return HS_SUCCESS; +} + +static really_inline +void rawStreamExec(struct hs_stream *stream_state, struct hs_scratch *scratch) { + assert(stream_state); + assert(scratch); + assert(!can_stop_matching(scratch)); + + DEBUG_PRINTF("::: streaming rose ::: offset = %llu len = %zu\n", + stream_state->offset, scratch->core_info.len); + + const struct RoseEngine *rose = stream_state->rose; + assert(rose); + roseStreamExec(rose, scratch); + + if (!told_to_stop_matching(scratch) && + isAllExhausted(rose, scratch->core_info.exhaustionVector)) { + DEBUG_PRINTF("stream exhausted\n"); + scratch->core_info.status |= STATUS_EXHAUSTED; + } +} + +static really_inline +void pureLiteralStreamExec(struct hs_stream *stream_state, + struct hs_scratch *scratch) { + assert(stream_state); + assert(scratch); + assert(!can_stop_matching(scratch)); + + const struct RoseEngine *rose = stream_state->rose; + const struct HWLM *ftable = getFLiteralMatcher(rose); + + 
size_t len2 = scratch->core_info.len; + + DEBUG_PRINTF("::: streaming rose ::: offset = %llu len = %zu\n", + stream_state->offset, scratch->core_info.len); + + pureLiteralInitScratch(scratch, stream_state->offset); + scratch->tctxt.groups = loadGroups(rose, scratch->core_info.state); + + // Pure literal cases don't have floatingMinDistance set, so we always + // start the match region at zero. + const size_t start = 0; + + hwlmExecStreaming(ftable, len2, start, roseCallback, scratch, + rose->initialGroups & rose->floating_group_mask); + + if (!told_to_stop_matching(scratch) && + isAllExhausted(rose, scratch->core_info.exhaustionVector)) { + DEBUG_PRINTF("stream exhausted\n"); + scratch->core_info.status |= STATUS_EXHAUSTED; + } +} + +static never_inline +void soleOutfixStreamExec(struct hs_stream *stream_state, + struct hs_scratch *scratch) { + assert(stream_state); + assert(scratch); + assert(!can_stop_matching(scratch)); + + const struct RoseEngine *t = stream_state->rose; + assert(t->outfixEndQueue == 1); + assert(!t->amatcherOffset); + assert(!t->ematcherOffset); + assert(!t->fmatcherOffset); + + const struct NFA *nfa = getNfaByQueue(t, 0); + + struct mq *q = scratch->queues; + initOutfixQueue(q, 0, t, scratch); + if (!scratch->core_info.buf_offset) { + nfaQueueInitState(nfa, q); + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + pushQueueAt(q, 2, MQE_END, scratch->core_info.len); + } else { + nfaExpandState(nfa, q->state, q->streamState, q->offset, + queue_prev_byte(q, 0)); + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_END, scratch->core_info.len); + } + + if (nfaQueueExec(q->nfa, q, scratch->core_info.len)) { + nfaQueueCompressState(nfa, q, scratch->core_info.len); + } else if (!told_to_stop_matching(scratch)) { + scratch->core_info.status |= STATUS_EXHAUSTED; + } +} + +static inline +hs_error_t hs_scan_stream_internal(hs_stream_t *id, const char *data, + unsigned length, UNUSED unsigned flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context) { + assert(id); + assert(scratch); + + if (unlikely(!data)) { + return HS_INVALID; + } + + const struct RoseEngine *rose = id->rose; + char *state = getMultiState(id); + + u8 status = getStreamStatus(state); + if (status & (STATUS_TERMINATED | STATUS_EXHAUSTED | STATUS_ERROR)) { + DEBUG_PRINTF("stream is broken, halting scan\n"); + if (status & STATUS_ERROR) { + return HS_UNKNOWN_ERROR; + } else if (status & STATUS_TERMINATED) { + return HS_SCAN_TERMINATED; + } else { + return HS_SUCCESS; + } + } + + // We avoid doing any work if the user has given us zero bytes of data to + // scan. Arguably we should define some semantics for how we treat vacuous + // cases here. 
+ if (unlikely(length == 0)) { + DEBUG_PRINTF("zero length block\n"); + return HS_SUCCESS; + } + + u32 historyAmount = getHistoryAmount(rose, id->offset); + populateCoreInfo(scratch, rose, state, onEvent, context, data, length, + getHistory(state, rose, id->offset), historyAmount, + id->offset, status, flags); + if (rose->ckeyCount) { + scratch->core_info.logicalVector = state + + rose->stateOffsets.logicalVec; + scratch->core_info.combVector = state + rose->stateOffsets.combVec; + if (!id->offset) { + scratch->tctxt.lastCombMatchOffset = id->offset; + } + } + assert(scratch->core_info.hlen <= id->offset + && scratch->core_info.hlen <= rose->historyRequired); + + prefetch_data(data, length); + + if (rose->somLocationCount) { + loadSomFromStream(scratch, id->offset); + } + + if (!id->offset && rose->boundary.reportZeroOffset) { + DEBUG_PRINTF("zero reports\n"); + int rv = roseRunBoundaryProgram(rose, rose->boundary.reportZeroOffset, + 0, scratch); + if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("halting scan\n"); + setStreamStatus(state, scratch->core_info.status); + if (told_to_stop_matching(scratch)) { + return HS_SCAN_TERMINATED; + } else { + assert(scratch->core_info.status & STATUS_EXHAUSTED); + return HS_SUCCESS; + } + } + } + + switch (rose->runtimeImpl) { + case ROSE_RUNTIME_FULL_ROSE: + rawStreamExec(id, scratch); + break; + case ROSE_RUNTIME_PURE_LITERAL: + pureLiteralStreamExec(id, scratch); + break; + case ROSE_RUNTIME_SINGLE_OUTFIX: + soleOutfixStreamExec(id, scratch); + break; + default: + assert(0); + } + + if (rose->hasSom && !told_to_stop_matching(scratch)) { + int halt = flushStoredSomMatches(scratch, ~0ULL); + if (halt) { + scratch->core_info.status |= STATUS_TERMINATED; + } + } + + setStreamStatus(state, scratch->core_info.status); + + if (unlikely(internal_matching_error(scratch))) { + return HS_UNKNOWN_ERROR; + } else if (likely(!can_stop_matching(scratch))) { + maintainHistoryBuffer(rose, state, data, length); + id->offset += length; /* maintain offset */ + + if (rose->somLocationCount) { + storeSomToStream(scratch, id->offset); + } + } else if (told_to_stop_matching(scratch)) { + return HS_SCAN_TERMINATED; + } + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_scan_stream(hs_stream_t *id, const char *data, + unsigned length, unsigned flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context) { + if (unlikely(!id || !scratch || !data || + !validScratch(id->rose, scratch))) { + return HS_INVALID; + } + + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + hs_error_t rv = hs_scan_stream_internal(id, data, length, flags, scratch, + onEvent, context); + unmarkScratchInUse(scratch); + return rv; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch, + match_event_handler onEvent, + void *context) { + if (!id) { + return HS_INVALID; + } + + if (onEvent) { + if (!scratch || !validScratch(id->rose, scratch)) { + return HS_INVALID; + } + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + report_eod_matches(id, scratch, onEvent, context); + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } + unmarkScratchInUse(scratch); + } + + hs_stream_free(id); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_reset_stream(hs_stream_t *id, UNUSED unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, + void *context) { + if (!id) { + return HS_INVALID; + } + + if 
(onEvent) { + if (!scratch || !validScratch(id->rose, scratch)) { + return HS_INVALID; + } + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + report_eod_matches(id, scratch, onEvent, context); + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } + unmarkScratchInUse(scratch); + } + + // history already initialised + init_stream(id, id->rose, 0); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_stream_size(const hs_database_t *db, + size_t *stream_size) { + if (!stream_size) { + return HS_INVALID; + } + + hs_error_t ret = validDatabase(db); + if (ret != HS_SUCCESS) { + return ret; + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + if (!ISALIGNED_16(rose)) { + return HS_INVALID; + } + + if (rose->mode != HS_MODE_STREAM) { + return HS_DB_MODE_ERROR; + } + + u32 base_stream_size = rose->stateOffsets.end; + + // stream state plus the hs_stream struct itself + *stream_size = base_stream_size + sizeof(struct hs_stream); + + return HS_SUCCESS; +} + +#if defined(DEBUG) || defined(DUMP_SUPPORT) +#include "util/compare.h" +// A debugging crutch: print a hex-escaped version of the match for our +// perusal. +static UNUSED +void dumpData(const char *data, size_t len) { + DEBUG_PRINTF("BUFFER:"); + for (size_t i = 0; i < len; i++) { + u8 c = data[i]; + if (ourisprint(c) && c != '\'') { + DEBUG_PRINTF("%c", c); + } else { + DEBUG_PRINTF("\\x%02x", c); + } + } + DEBUG_PRINTF("\n"); +} +#endif + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_scan_vector(const hs_database_t *db, + const char * const * data, + const unsigned int *length, + unsigned int count, + UNUSED unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context) { + if (unlikely(!scratch || !data || !length)) { + return HS_INVALID; + } + + hs_error_t err = validDatabase(db); + if (unlikely(err != HS_SUCCESS)) { + return err; + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + if (unlikely(!ISALIGNED_16(rose))) { + return HS_INVALID; + } + + if (unlikely(rose->mode != HS_MODE_VECTORED)) { + return HS_DB_MODE_ERROR; + } + + if (unlikely(!validScratch(rose, scratch))) { + return HS_INVALID; + } + + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + + hs_stream_t *id = (hs_stream_t *)(scratch->bstate); + + init_stream(id, rose, 1); /* open stream */ + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("block %u/%u offset=%llu len=%u\n", i, count, id->offset, + length[i]); +#ifdef DEBUG + dumpData(data[i], length[i]); +#endif + hs_error_t ret + = hs_scan_stream_internal(id, data[i], length[i], 0, scratch, + onEvent, context); + if (ret != HS_SUCCESS) { + unmarkScratchInUse(scratch); + return ret; + } + } + + /* close stream */ + if (onEvent) { + report_eod_matches(id, scratch, onEvent, context); + + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } else if (told_to_stop_matching(scratch)) { + unmarkScratchInUse(scratch); + return HS_SCAN_TERMINATED; + } + } + + unmarkScratchInUse(scratch); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_scan_tfwstr(const hs_database_t *db, + const void *data,/*TfwStr*/ + UNUSED unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context) { + + const TfwStr *chunk, *end, *str; + + if (unlikely(!scratch || !data )) { + return HS_INVALID; + } + + str = (const TfwStr *)data; + + hs_error_t err = validDatabase(db); + if (unlikely(err 
!= HS_SUCCESS)) { + return err; + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + if (unlikely(!ISALIGNED_16(rose))) { + return HS_INVALID; + } + + if (unlikely(rose->mode != HS_MODE_VECTORED)) { + return HS_DB_MODE_ERROR; + } + + if (unlikely(!validScratch(rose, scratch))) { + return HS_INVALID; + } + + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + + hs_stream_t *id = (hs_stream_t *)(scratch->bstate); + + init_stream(id, rose, 1); /* open stream */ + + + TFW_STR_FOR_EACH_CHUNK(chunk, str, end) { + DEBUG_PRINTF("offset=%llu len=%lu\n", id->offset, chunk->len); +#ifdef DEBUG + dumpData(chunk->data, chunk->len); +#endif + hs_error_t ret + = hs_scan_stream_internal(id, chunk->data, chunk->len, 0, scratch, + onEvent, context); + if (ret != HS_SUCCESS) { + unmarkScratchInUse(scratch); + return ret; + } + } + + /* close stream */ + if (onEvent) { + report_eod_matches(id, scratch, onEvent, context); + + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } else if (told_to_stop_matching(scratch)) { + unmarkScratchInUse(scratch); + return HS_SCAN_TERMINATED; + } + } + + unmarkScratchInUse(scratch); + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_compress_stream(const hs_stream_t *stream, char *buf, + size_t buf_space, size_t *used_space) { + if (unlikely(!stream || !used_space)) { + return HS_INVALID; + } + + if (unlikely(buf_space && !buf)) { + return HS_INVALID; + } + + const struct RoseEngine *rose = stream->rose; + + size_t stream_size = size_compress_stream(rose, stream); + + DEBUG_PRINTF("require %zu [orig %zu]\n", stream_size, + rose->stateOffsets.end + sizeof(struct hs_stream)); + *used_space = stream_size; + + if (buf_space < stream_size) { + return HS_INSUFFICIENT_SPACE; + } + compress_stream(buf, stream_size, rose, stream); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_expand_stream(const hs_database_t *db, + hs_stream_t **stream, + const char *buf, size_t buf_size) { + if (unlikely(!stream || !buf)) { + return HS_INVALID; + } + + *stream = NULL; + + hs_error_t err = validDatabase(db); + if (unlikely(err != HS_SUCCESS)) { + return err; + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + if (unlikely(!ISALIGNED_16(rose))) { + return HS_INVALID; + } + + if (unlikely(rose->mode != HS_MODE_STREAM)) { + return HS_DB_MODE_ERROR; + } + + size_t stream_size = rose->stateOffsets.end + sizeof(struct hs_stream); + + struct hs_stream *s = hs_stream_alloc(stream_size); + if (unlikely(!s)) { + return HS_NOMEM; + } + + if (!expand_stream(s, rose, buf, buf_size)) { + hs_stream_free(s); + return HS_INVALID; + } + + *stream = s; + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_reset_and_expand_stream(hs_stream_t *to_stream, + const char *buf, size_t buf_size, + hs_scratch_t *scratch, + match_event_handler onEvent, + void *context) { + if (unlikely(!to_stream || !buf)) { + return HS_INVALID; + } + + const struct RoseEngine *rose = to_stream->rose; + + if (onEvent) { + if (!scratch || !validScratch(to_stream->rose, scratch)) { + return HS_INVALID; + } + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + report_eod_matches(to_stream, scratch, onEvent, context); + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } + unmarkScratchInUse(scratch); + } + + if (expand_stream(to_stream, rose, buf, buf_size)) { + return HS_SUCCESS; + } else { + return HS_INVALID; + 
} +} diff --git a/regex/scratch.c b/regex/scratch.c new file mode 100644 index 000000000..1e620fe73 --- /dev/null +++ b/regex/scratch.c @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Functions for allocating and manipulating scratch space. + */ + +#ifndef __KERNEL__ +#include +#include +#else +#include +#include +#endif + +#include "allocator.h" +#include "hs_internal.h" +#include "hs_runtime.h" +#include "scratch.h" +#include "state.h" +#include "ue2common.h" +#include "database.h" +#include "nfa/nfa_api_queue.h" +#include "rose/rose_internal.h" +#include "util/fatbit.h" + +/** + * Determine the space required for a correctly aligned array of fatbit + * structure, laid out as: + * + * - an array of num_entries pointers, each to a fatbit. + * - an array of fatbit structures, each of size fatbit_len. + * + * fatbit_len should have been determined at compile time, via the + * fatbit_size() call. + */ +static +size_t fatbit_array_size(u32 num_entries, u32 fatbit_len) { + size_t len = 0; + + // Array of pointers to each fatbit entry. + len += sizeof(struct fatbit *) * num_entries; + + // Fatbit entries themselves. + len = ROUNDUP_N(len, alignof(struct fatbit)); + len += (size_t)fatbit_len * num_entries; + + return ROUNDUP_N(len, 8); // Round up for potential padding. +} + +/** Used by hs_alloc_scratch and hs_clone_scratch to allocate a complete + * scratch region from a prototype structure. 
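+ *
+ * Everything lives in a single allocation: the hs_scratch structure itself
+ * is followed by the queue array, SOM stores, delay-slot and
+ * anchored-literal fatbit arrays, catch-up queue, block and transient
+ * state, deduper logs and the cacheline-aligned fullState, all carved out
+ * via the 'current' pointer below with padding added for alignment. If
+ * *scratch is already non-NULL, the existing region is reused in place
+ * rather than allocating a new one.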
*/ +static +hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) { + u32 queueCount = proto->queueCount; + u32 activeQueueArraySize = proto->activeQueueArraySize; + u32 deduperCount = proto->deduper.dkey_count; + u32 deduperLogSize = proto->deduper.log_size; + u32 bStateSize = proto->bStateSize; + u32 tStateSize = proto->tStateSize; + u32 fullStateSize = proto->fullStateSize; + u32 anchored_literal_region_len = proto->anchored_literal_region_len; + u32 anchored_literal_fatbit_size = proto->anchored_literal_fatbit_size; + + u32 som_store_size = proto->som_store_count * sizeof(u64a); + u32 som_attempted_store_size = proto->som_store_count * sizeof(u64a); + u32 som_now_size = proto->som_fatbit_size; + u32 som_attempted_size = proto->som_fatbit_size; + + struct hs_scratch *s; + struct hs_scratch *s_tmp; + size_t queue_size = queueCount * sizeof(struct mq); + size_t qmpq_size = queueCount * sizeof(struct queue_match); + + assert(anchored_literal_region_len < 8 * sizeof(s->al_log_sum)); + + size_t anchored_literal_region_size = fatbit_array_size( + anchored_literal_region_len, proto->anchored_literal_fatbit_size); + size_t delay_region_size = + fatbit_array_size(DELAY_SLOT_COUNT, proto->delay_fatbit_size); + + // the size is all the allocated stuff, not including the struct itself + size_t size = queue_size + 63 + + bStateSize + tStateSize + + fullStateSize + 63 /* cacheline padding */ + + proto->handledKeyFatbitSize /* handled roles */ + + activeQueueArraySize /* active queue array */ + + 2 * deduperLogSize /* need odd and even logs */ + + 2 * deduperLogSize /* ditto som logs */ + + 2 * sizeof(u64a) * deduperCount /* start offsets for som */ + + anchored_literal_region_size + qmpq_size + + delay_region_size + + som_store_size + + som_now_size + + som_attempted_size + + som_attempted_store_size + 15; + + /* the struct plus the allocated stuff plus padding for cacheline + * alignment */ + size_t alloc_size = sizeof(struct hs_scratch) + size + 256; + if (!*scratch) { + s_tmp = hs_scratch_alloc(alloc_size); + hs_error_t err = hs_check_alloc(s_tmp); + if (err != HS_SUCCESS) { + hs_scratch_free(s_tmp); + return err; + } + + memset(s_tmp, 0, alloc_size); + s = ROUNDUP_PTR(s_tmp, 64); + DEBUG_PRINTF("allocated %zu bytes at %p but realigning to %p\n", alloc_size, s_tmp, s); + DEBUG_PRINTF("sizeof %zu\n", sizeof(struct hs_scratch)); + } else { + s = *scratch; + assert(proto->scratchSize == alloc_size); + s_tmp = (hs_scratch_t *)s->scratch_alloc; + } + + *s = *proto; + + s->magic = SCRATCH_MAGIC; + s->in_use = 0; + s->scratchSize = alloc_size; + s->scratch_alloc = (char *)s_tmp; + s->fdr_conf = NULL; + + // each of these is at an offset from the previous + char *current = (char *)s + sizeof(*s); + + // align current so that the following arrays are naturally aligned: this + // is accounted for in the padding allocated + current = ROUNDUP_PTR(current, 8); + + s->queues = (struct mq *)current; + current += queue_size; + + assert(ISALIGNED_N(current, 8)); + s->som_store = (u64a *)current; + current += som_store_size; + + s->som_attempted_store = (u64a *)current; + current += som_attempted_store_size; + + current = ROUNDUP_PTR(current, alignof(struct fatbit *)); + s->delay_slots = (struct fatbit **)current; + current += sizeof(struct fatbit *) * DELAY_SLOT_COUNT; + current = ROUNDUP_PTR(current, alignof(struct fatbit)); + for (u32 i = 0; i < DELAY_SLOT_COUNT; i++) { + s->delay_slots[i] = (struct fatbit *)current; + assert(ISALIGNED(s->delay_slots[i])); + current += 
proto->delay_fatbit_size; + } + + current = ROUNDUP_PTR(current, alignof(struct fatbit *)); + s->al_log = (struct fatbit **)current; + current += sizeof(struct fatbit *) * anchored_literal_region_len; + current = ROUNDUP_PTR(current, alignof(struct fatbit)); + for (u32 i = 0; i < anchored_literal_region_len; i++) { + s->al_log[i] = (struct fatbit *)current; + assert(ISALIGNED(s->al_log[i])); + current += anchored_literal_fatbit_size; + } + + current = ROUNDUP_PTR(current, 8); + s->catchup_pq.qm = (struct queue_match *)current; + current += qmpq_size; + + s->bstate = (char *)current; + s->bStateSize = bStateSize; + current += bStateSize; + + s->tstate = (char *)current; + s->tStateSize = tStateSize; + current += tStateSize; + + current = ROUNDUP_PTR(current, 64); + + assert(ISALIGNED_N(current, 8)); + s->deduper.som_start_log[0] = (u64a *)current; + current += sizeof(u64a) * deduperCount; + + s->deduper.som_start_log[1] = (u64a *)current; + current += sizeof(u64a) * deduperCount; + + assert(ISALIGNED_N(current, 8)); + s->aqa = (struct fatbit *)current; + current += activeQueueArraySize; + + s->handled_roles = (struct fatbit *)current; + current += proto->handledKeyFatbitSize; + + s->deduper.log[0] = (struct fatbit *)current; + current += deduperLogSize; + + s->deduper.log[1] = (struct fatbit *)current; + current += deduperLogSize; + + s->deduper.som_log[0] = (struct fatbit *)current; + current += deduperLogSize; + + s->deduper.som_log[1] = (struct fatbit *)current; + current += deduperLogSize; + + s->som_set_now = (struct fatbit *)current; + current += som_now_size; + + s->som_attempted_set = (struct fatbit *)current; + current += som_attempted_size; + + current = ROUNDUP_PTR(current, 64); + assert(ISALIGNED_CL(current)); + s->fullState = (char *)current; + s->fullStateSize = fullStateSize; + current += fullStateSize; + + *scratch = s; + + // Don't get too big for your boots + assert((size_t)(current - (char *)s) <= alloc_size); + + // Init q->scratch ptr for every queue. + for (struct mq *qi = s->queues; qi != s->queues + queueCount; ++qi) { + qi->scratch = s; + } + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_alloc_scratch(const hs_database_t *db, + hs_scratch_t **scratch) { + if (!db || !scratch) { + return HS_INVALID; + } + + /* We need to do some real sanity checks on the database as some users mmap + * in old deserialised databases, so this is the first real opportunity we + * have to make sure it is sane. + */ + hs_error_t rv = dbIsValid(db); + if (rv != HS_SUCCESS) { + return rv; + } + + /* We can also sanity-check the scratch parameter: if it points to an + * existing scratch area, that scratch should have valid magic bits. 
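+     * A scratch region that is currently marked as in use by another API
+     * call is not touched; it is rejected with HS_SCRATCH_IN_USE.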
*/ + if (*scratch != NULL) { + /* has to be aligned before we can do anything with it */ + if (!ISALIGNED_CL(*scratch)) { + return HS_INVALID; + } + if ((*scratch)->magic != SCRATCH_MAGIC) { + return HS_INVALID; + } + if (markScratchInUse(*scratch)) { + return HS_SCRATCH_IN_USE; + } + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + int resize = 0; + + hs_scratch_t *proto; + hs_scratch_t *proto_tmp = hs_scratch_alloc(sizeof(struct hs_scratch) + 256); + hs_error_t proto_ret = hs_check_alloc(proto_tmp); + if (proto_ret != HS_SUCCESS) { + hs_scratch_free(proto_tmp); + if (*scratch) { + hs_scratch_free((*scratch)->scratch_alloc); + } + *scratch = NULL; + return proto_ret; + } + + proto = ROUNDUP_PTR(proto_tmp, 64); + + if (*scratch) { + *proto = **scratch; + } else { + memset(proto, 0, sizeof(*proto)); + resize = 1; + } + proto->scratch_alloc = (char *)proto_tmp; + + if (rose->anchoredDistance > proto->anchored_literal_region_len) { + resize = 1; + proto->anchored_literal_region_len = rose->anchoredDistance; + } + + if (rose->anchored_fatbit_size > proto->anchored_literal_fatbit_size) { + resize = 1; + proto->anchored_literal_fatbit_size = rose->anchored_fatbit_size; + } + + if (rose->delay_fatbit_size > proto->delay_fatbit_size) { + resize = 1; + proto->delay_fatbit_size = rose->delay_fatbit_size; + } + + if (rose->handledKeyFatbitSize > proto->handledKeyFatbitSize) { + resize = 1; + proto->handledKeyFatbitSize = rose->handledKeyFatbitSize; + } + + if (rose->tStateSize > proto->tStateSize) { + resize = 1; + proto->tStateSize = rose->tStateSize; + } + + u32 som_store_count = rose->somLocationCount; + if (som_store_count > proto->som_store_count) { + resize = 1; + proto->som_store_count = som_store_count; + } + + if (rose->somLocationFatbitSize > proto->som_fatbit_size) { + resize = 1; + proto->som_fatbit_size = rose->somLocationFatbitSize; + } + + u32 queueCount = rose->queueCount; + if (queueCount > proto->queueCount) { + resize = 1; + proto->queueCount = queueCount; + } + + if (rose->activeQueueArraySize > proto->activeQueueArraySize) { + resize = 1; + proto->activeQueueArraySize = rose->activeQueueArraySize; + } + + u32 bStateSize = 0; + if (rose->mode == HS_MODE_BLOCK) { + bStateSize = rose->stateOffsets.end; + } else if (rose->mode == HS_MODE_VECTORED) { + /* vectoring database require a full stream state (inc header) */ + bStateSize = sizeof(struct hs_stream) + rose->stateOffsets.end; + } + + if (bStateSize > proto->bStateSize) { + resize = 1; + proto->bStateSize = bStateSize; + } + + u32 fullStateSize = rose->scratchStateSize; + if (fullStateSize > proto->fullStateSize) { + resize = 1; + proto->fullStateSize = fullStateSize; + } + + if (rose->dkeyCount > proto->deduper.dkey_count) { + resize = 1; + proto->deduper.dkey_count = rose->dkeyCount; + proto->deduper.log_size = rose->dkeyLogSize; + } + + if (resize) { + if (*scratch) { + hs_scratch_free((*scratch)->scratch_alloc); + *scratch = NULL; + } + + hs_error_t alloc_ret = alloc_scratch(proto, scratch); + hs_scratch_free(proto_tmp); /* kill off temp used for sizing */ + if (alloc_ret != HS_SUCCESS) { + *scratch = NULL; + return alloc_ret; + } + } else { + hs_scratch_free(proto_tmp); /* kill off temp used for sizing */ + unmarkScratchInUse(*scratch); + } + + assert(!(*scratch)->in_use); + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_clone_scratch(const hs_scratch_t *src, + hs_scratch_t **dest) { + if (!dest || !src || !ISALIGNED_CL(src) || src->magic != SCRATCH_MAGIC) { + return HS_INVALID; + } + + *dest = 
NULL; + hs_error_t ret = alloc_scratch(src, dest); + if (ret != HS_SUCCESS) { + *dest = NULL; + return ret; + } + + assert(!(*dest)->in_use); + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_init_scratch(const hs_scratch_t *src, hs_scratch_t *dest) { + if (!src || !ISALIGNED_CL(src) || src->magic != SCRATCH_MAGIC) + return HS_INVALID; + if (!dest || !ISALIGNED_CL(dest)) + return HS_INVALID; + + memset(dest, 0, src->scratchSize); + return alloc_scratch(src, &dest); +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_free_scratch(hs_scratch_t *scratch) { + if (scratch) { + /* has to be aligned before we can do anything with it */ + if (!ISALIGNED_CL(scratch)) { + return HS_INVALID; + } + if (scratch->magic != SCRATCH_MAGIC) { + return HS_INVALID; + } + if (markScratchInUse(scratch)) { + return HS_SCRATCH_IN_USE; + } + + scratch->magic = 0; + assert(scratch->scratch_alloc); + DEBUG_PRINTF("scratch %p is really at %p : freeing\n", scratch, + scratch->scratch_alloc); + hs_scratch_free(scratch->scratch_alloc); + } + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_scratch_size(const hs_scratch_t *scratch, size_t *size) { + if (!size || !scratch || !ISALIGNED_CL(scratch) || + scratch->magic != SCRATCH_MAGIC) { + return HS_INVALID; + } + + *size = scratch->scratchSize; + + return HS_SUCCESS; +} diff --git a/regex/scratch.h b/regex/scratch.h new file mode 100644 index 000000000..1256f7aba --- /dev/null +++ b/regex/scratch.h @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Scratch and associated data structures. + * + * This header gets pulled into many places (many deep, slow to compile + * places). Try to keep the included headers under control. 
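+ *
+ * Scratch holds the mutable per-scan state; a given scratch region may only
+ * be used by one API call at a time (tracked via the in_use flag), so
+ * concurrent scans each need their own region, e.g. via hs_clone_scratch().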
+ */ + +#ifndef SCRATCH_H_DA6D4FC06FF410 +#define SCRATCH_H_DA6D4FC06FF410 + +#include "hs_common.h" +#include "ue2common.h" +#include "rose/rose_types.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +UNUSED static const u32 SCRATCH_MAGIC = 0x544F4259; + +struct fatbit; +struct hs_scratch; +struct RoseEngine; +struct mq; + +struct queue_match { + /** \brief used to store the current location of an (suf|out)fix match in + * the current buffer. + * + * As (suf|out)fixes always run in the main buffer and never in history + * this number will always be positive (matches at 0 belong to previous + * write). Hence we can get away with a size_t rather than the usual s64a + * for a location. */ + size_t loc; + + u32 queue; /**< queue index. */ +}; + +struct catchup_pq { + struct queue_match *qm; + u32 qm_size; /**< current size of the priority queue */ +}; + +/** \brief Status flag: user requested termination. */ +#define STATUS_TERMINATED (1U << 0) + +/** \brief Status flag: it has been determined that it is not possible for this + * stream to raise any more matches. + * + * This may be because all its exhaustion keys are on or for other reasons + * (anchored sections not matching). */ +#define STATUS_EXHAUSTED (1U << 1) + +/** \brief Status flag: Rose requires rebuild as delay literal matched in + * history. */ +#define STATUS_DELAY_DIRTY (1U << 2) + +/** \brief Status flag: Unexpected Rose program error. */ +#define STATUS_ERROR (1U << 3) + +/** \brief Core information about the current scan, used everywhere. */ +struct core_info { + void *userContext; /**< user-supplied context */ + + /** \brief user-supplied match callback */ + int (HS_CDECL *userCallback)(unsigned int id, unsigned long long from, + unsigned long long to, unsigned int flags, + void *ctx); + + const struct RoseEngine *rose; + char *state; /**< full stream state */ + char *exhaustionVector; /**< pointer to evec for this stream */ + char *logicalVector; /**< pointer to lvec for this stream */ + char *combVector; /**< pointer to cvec for this stream */ + const u8 *buf; /**< main scan buffer */ + size_t len; /**< length of main scan buffer in bytes */ + const u8 *hbuf; /**< history buffer */ + size_t hlen; /**< length of history buffer in bytes. */ + u64a buf_offset; /**< stream offset, for the base of the buffer */ + u8 status; /**< stream status bitmask, using STATUS_ flags above */ +}; + +/** \brief Rose state information. */ +struct RoseContext { + u8 mpv_inactive; + u64a groups; + u64a lit_offset_adjust; /**< offset to add to matches coming from hwlm */ + u64a delayLastEndOffset; /**< end of the last match from FDR used by delay + * code */ + u64a lastEndOffset; /**< end of the last match from FDR/anchored DFAs used + * by history code. 
anchored DFA matches update this + * when they are inserted into the literal match + * stream */ + u64a lastMatchOffset; /**< last match offset report up out of rose; + * used _only_ for debugging, asserts */ + u64a lastCombMatchOffset; /**< last match offset of active combinations */ + u64a minMatchOffset; /**< the earliest offset that we are still allowed to + * report */ + u64a minNonMpvMatchOffset; /**< the earliest offset that non-mpv engines are + * still allowed to report */ + u64a next_mpv_offset; /**< earliest offset that the MPV can next report a + * match, cleared if top events arrive */ + u32 filledDelayedSlots; + u32 curr_qi; /**< currently executing main queue index during + * \ref nfaQueueExec */ + + /** + * \brief Buffer for caseful long literal support, used in streaming mode + * only. + * + * If a long literal prefix was at the end of the buffer at the end of a + * stream write, then the long lit table hashes it and stores the result in + * stream state. At the start of the next write, this value is used to set + * this buffer to the matching prefix string (stored in the bytecode. + */ + const u8 *ll_buf; + + /** \brief Length in bytes of the string pointed to by ll_buf. */ + size_t ll_len; + + /** \brief Caseless version of ll_buf. */ + const u8 *ll_buf_nocase; + + /** \brief Length in bytes of the string pointed to by ll_buf_nocase. */ + size_t ll_len_nocase; +}; + +struct match_deduper { + struct fatbit *log[2]; /**< even, odd logs */ + struct fatbit *som_log[2]; /**< even, odd fatbit logs for som */ + u64a *som_start_log[2]; /**< even, odd start offset logs for som */ + u32 dkey_count; + u32 log_size; + u64a current_report_offset; + u8 som_log_dirty; +}; + +/** \brief Hyperscan scratch region header. + * + * NOTE: there is no requirement that scratch is 16-byte aligned, as it is + * allocated by a malloc equivalent, possibly supplied by the user. + */ +struct ALIGN_CL_DIRECTIVE hs_scratch { + u32 magic; + u8 in_use; /**< non-zero when being used by an API call. 
*/ + u32 queueCount; + u32 activeQueueArraySize; /**< size of active queue array fatbit in bytes */ + u32 bStateSize; /**< sizeof block mode states */ + u32 tStateSize; /**< sizeof transient rose states */ + u32 fullStateSize; /**< size of uncompressed nfa state */ + struct RoseContext tctxt; + char *bstate; /**< block mode states */ + char *tstate; /**< state for transient roses */ + char *fullState; /**< uncompressed NFA state */ + struct mq *queues; + struct fatbit *aqa; /**< active queue array; fatbit of queues that are valid + * & active */ + struct fatbit **delay_slots; + struct fatbit **al_log; + u64a al_log_sum; + struct catchup_pq catchup_pq; + struct core_info core_info; + struct match_deduper deduper; + u32 anchored_literal_region_len; + u32 anchored_literal_fatbit_size; /**< size of each anch fatbit in bytes */ + struct fatbit *handled_roles; /**< fatbit of ROLES (not states) already + * handled by this literal */ + u64a *som_store; /**< array of som locations */ + u64a *som_attempted_store; /**< array of som locations for fail stores */ + struct fatbit *som_set_now; /**< fatbit, true if the som location was set + * based on a match at the current offset */ + struct fatbit *som_attempted_set; /**< fatbit, true if the som location + * would have been set at the current offset if the + * location had been writable */ + u64a som_set_now_offset; /**< offset at which som_set_now represents */ + u32 som_store_count; + u32 som_fatbit_size; /**< size of som location fatbit structures in bytes */ + u32 handledKeyFatbitSize; /**< size of handled_keys fatbit in bytes */ + u32 delay_fatbit_size; /**< size of each delay fatbit in bytes */ + u32 scratchSize; + char *scratch_alloc; /* user allocated scratch object */ + u64a *fdr_conf; /**< FDR confirm value */ + u8 fdr_conf_offset; /**< offset where FDR/Teddy front end matches + * in buffer */ +}; + +/* array of fatbit ptr; TODO: why not an array of fatbits? */ +static really_inline +struct fatbit **getAnchoredLiteralLog(struct hs_scratch *scratch) { + return scratch->al_log; +} + +static really_inline +struct fatbit **getDelaySlots(struct hs_scratch *scratch) { + return scratch->delay_slots; +} + +static really_inline +char told_to_stop_matching(const struct hs_scratch *scratch) { + return scratch->core_info.status & STATUS_TERMINATED; +} + +static really_inline +char can_stop_matching(const struct hs_scratch *scratch) { + return scratch->core_info.status & + (STATUS_TERMINATED | STATUS_EXHAUSTED | STATUS_ERROR); +} + +static really_inline +char internal_matching_error(const struct hs_scratch *scratch) { + return scratch->core_info.status & STATUS_ERROR; +} + +/** + * \brief Mark scratch as in use. + * + * Returns non-zero if it was already in use, zero otherwise. + */ +static really_inline +char markScratchInUse(struct hs_scratch *scratch) { + DEBUG_PRINTF("marking scratch as in use\n"); + assert(scratch && scratch->magic == SCRATCH_MAGIC); + if (scratch->in_use) { + DEBUG_PRINTF("scratch already in use!\n"); + return 1; + } + scratch->in_use = 1; + return 0; +} + +/** + * \brief Mark scratch as no longer in use. 
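+ *
+ * Should only be called on a scratch that was successfully marked as in
+ * use by the current API call.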
+ */ +static really_inline +void unmarkScratchInUse(struct hs_scratch *scratch) { + DEBUG_PRINTF("marking scratch as not in use\n"); + assert(scratch && scratch->magic == SCRATCH_MAGIC); + assert(scratch->in_use == 1); + scratch->in_use = 0; +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* SCRATCH_H_DA6D4FC06FF410 */ + diff --git a/regex/smallwrite/smallwrite_internal.h b/regex/smallwrite/smallwrite_internal.h new file mode 100644 index 000000000..8f350dbea --- /dev/null +++ b/regex/smallwrite/smallwrite_internal.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SMALLWRITE_INTERNAL_H +#define SMALLWRITE_INTERNAL_H + +#include "ue2common.h" + +// Runtime structure header for SmallWrite. +struct ALIGN_CL_DIRECTIVE SmallWriteEngine { + u32 largestBuffer; /**< largest buffer that can be considered small write */ + u32 start_offset; /**< where to start scanning in the buffer. */ + u32 size; /**< size of the small write engine in bytes (including the nfa) */ +}; + +struct NFA; + +static really_inline +const struct NFA *getSmwrNfa(const struct SmallWriteEngine *smwr) { + assert(smwr); + const struct NFA *n + = (const struct NFA *)((const char *)smwr + sizeof(*smwr)); + assert(ISALIGNED_CL(n)); + return n; +} + +#endif // SMALLWRITE_INTERNAL_H + diff --git a/regex/som/som_operation.h b/regex/som/som_operation.h new file mode 100644 index 000000000..d85ad2268 --- /dev/null +++ b/regex/som/som_operation.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief SOM runtime: data structures. + * + * Data structures used for SOM operations. + */ + +#ifndef SOM_OPERATION_H +#define SOM_OPERATION_H + +#include "ue2common.h" + +#define SOM_EXTERNAL_CALLBACK_REL 1 +#define SOM_INTERNAL_LOC_SET 2 +#define SOM_INTERNAL_LOC_SET_IF_UNSET 3 +#define SOM_INTERNAL_LOC_SET_IF_WRITABLE 4 +#define SOM_INTERNAL_LOC_SET_REV_NFA 5 +#define SOM_INTERNAL_LOC_SET_REV_NFA_IF_UNSET 6 +#define SOM_INTERNAL_LOC_SET_REV_NFA_IF_WRITABLE 7 +#define SOM_INTERNAL_LOC_COPY 8 +#define SOM_INTERNAL_LOC_COPY_IF_WRITABLE 9 +#define SOM_INTERNAL_LOC_MAKE_WRITABLE 10 +#define SOM_EXTERNAL_CALLBACK_STORED 11 +#define SOM_EXTERNAL_CALLBACK_ABS 12 +#define SOM_EXTERNAL_CALLBACK_REV_NFA 13 +#define SOM_INTERNAL_LOC_SET_FROM 14 +#define SOM_INTERNAL_LOC_SET_FROM_IF_WRITABLE 15 + +struct som_operation { + /** \brief Report type, from the definitions above. */ + u8 type; + + /* \brief SOM loc to modify. */ + u32 onmatch; + + union { + /** \brief SOM distance value, use varies according to type. + * + * - for SOM_EXTERNAL_CALLBACK_REL, from-offset is this many bytes + * before the to-offset. + * - for SOM_EXTERNAL_CALLBACK_ABS, set from-offset to this value. + * - for SOM_INTERNAL_LOC_COPY*, som location read_from. + */ + u64a somDistance; + + /** \brief Index of the reverse nfa. + * + * Used by SOM_EXTERNAL_CALLBACK_REV_NFA and + * SOM_INTERNAL_LOC_SET_REV_NFA*. + */ + u64a revNfaIndex; + } aux; +}; + +#endif // SOM_OPERATION_H + diff --git a/regex/som/som_runtime.c b/regex/som/som_runtime.c new file mode 100644 index 000000000..1a868efc9 --- /dev/null +++ b/regex/som/som_runtime.c @@ -0,0 +1,535 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SOM runtime code. + * + * + * Runtime code for SOM handling called by the Rose callback adaptors. + * + * Note: + * Races between escapes making a som loc writeable and attempts to write to it + * at the same to_offset are always resolved as if the escape arrived first + * and then the request to write to that location. + */ + +#include "hs_internal.h" +#include "som_operation.h" +#include "som_runtime.h" +#include "scratch.h" +#include "ue2common.h" +#include "rose/rose_internal.h" +#include "nfa/nfa_api.h" +#include "nfa/nfa_internal.h" +#include "util/fatbit.h" +#include "util/multibit.h" + +static really_inline +void setSomLoc(struct fatbit *som_set_now, u64a *som_store, u32 som_store_count, + const struct som_operation *ri, u64a to_offset) { + /* validity handled by callers */ + assert(to_offset >= ri->aux.somDistance); + u64a start_offset = to_offset - ri->aux.somDistance; + u32 som_loc = ri->onmatch; + + /* resolve any races for matches at this point in favour of the earliest som + */ + if (!fatbit_set(som_set_now, som_store_count, som_loc)) { + som_store[som_loc] = start_offset; + } else { + LIMIT_TO_AT_MOST(&som_store[som_loc], start_offset); + } + + DEBUG_PRINTF("som_store[%u] set to %llu\n", som_loc, som_store[som_loc]); +} + +static really_inline +char ok_and_mark_if_write(u8 *som_store_valid, struct fatbit *som_set_now, + u8 *som_store_writable, u32 som_store_count, + u32 loc) { + return !mmbit_set(som_store_valid, som_store_count, loc) /* unwritten */ + || fatbit_isset(som_set_now, som_store_count, loc) /* write here, need + * to resolve race */ + || mmbit_isset(som_store_writable, som_store_count, loc); /* writable */ +} + +static really_inline +char ok_and_mark_if_unset(u8 *som_store_valid, struct fatbit *som_set_now, + u32 som_store_count, u32 loc) { + return !mmbit_set(som_store_valid, som_store_count, loc) /* unwritten */ + || fatbit_isset(som_set_now, som_store_count, loc); /* write here, need + * to resolve race */ +} + +static +int somRevCallback(UNUSED u64a start, u64a end, ReportID id, void *ctx) { + DEBUG_PRINTF("offset=%llu, id=%u\n", end, id); + + // We use the id to store the offset adjustment (for assertions like a + // leading \b or multiline mode). + assert(id <= 1); + u64a *from_offset = ctx; + LIMIT_TO_AT_MOST(from_offset, end + id); + return 1; // continue matching. 
+} + +static really_inline +const struct NFA *getSomRevNFA(const struct RoseEngine *t, u32 i) { + assert(t->somRevOffsetOffset); + const u32 *rev_offsets + = (const u32 *)((const u8 *)t + t->somRevOffsetOffset); + u32 nfa_offset = rev_offsets[i]; + assert(nfa_offset && nfa_offset < t->size); + const struct NFA *n = (const struct NFA *)(((const u8 *)t + nfa_offset)); + assert(ISALIGNED(n)); + + return n; +} + +static +void runRevNfa(struct hs_scratch *scratch, const struct som_operation *ri, + const u64a to_offset, u64a *from_offset) { + struct core_info *ci = &scratch->core_info; + + DEBUG_PRINTF("buf has %zu bytes total, history has %zu\n", + ci->len, ci->hlen); + + u32 nfa_idx = ri->aux.revNfaIndex; + DEBUG_PRINTF("run rev nfa %u from to_offset=%llu\n", nfa_idx, to_offset); + const struct NFA *nfa = getSomRevNFA(ci->rose, nfa_idx); + + assert(nfa->maxWidth); // No inf width rev NFAs. + + size_t buf_bytes = to_offset - ci->buf_offset; + size_t history_bytes = ci->hlen; + + DEBUG_PRINTF("nfa min/max widths [%u,%u], %zu in buffer, %zu in history\n", + nfa->minWidth, nfa->maxWidth, buf_bytes, history_bytes); + assert(nfa->minWidth <= buf_bytes + history_bytes); + + const u8 *buf = ci->buf; + const u8 *hbuf = ci->hbuf; + + // Work out if we need to scan any history as well. + if (history_bytes && buf_bytes < nfa->maxWidth) { + assert(hbuf); + size_t remainder = nfa->maxWidth - buf_bytes; + if (remainder < history_bytes) { + hbuf += history_bytes - remainder; + history_bytes = remainder; + } + } + + DEBUG_PRINTF("scanning %zu from buffer and %zu from history\n", buf_bytes, + history_bytes); + + *from_offset = to_offset; + + nfaBlockExecReverse(nfa, to_offset, buf, buf_bytes, hbuf, history_bytes, + somRevCallback, from_offset); + + assert(*from_offset <= to_offset); +} + +static really_inline +void setSomLocRevNfa(struct hs_scratch *scratch, struct fatbit *som_set_now, + u64a *som_store, u32 som_store_count, + const struct som_operation *ri, u64a to_offset) { + /* validity handled by callers */ + u64a from_offset = 0; + runRevNfa(scratch, ri, to_offset, &from_offset); + + u32 som_loc = ri->onmatch; + + /* resolve any races for matches at this point in favour of the earliest som + */ + if (!fatbit_set(som_set_now, som_store_count, som_loc)) { + som_store[som_loc] = from_offset; + } else { + LIMIT_TO_AT_MOST(&som_store[som_loc], from_offset); + } + + DEBUG_PRINTF("som_store[%u] set to %llu\n", som_loc, som_store[som_loc]); +} + +void handleSomInternal(struct hs_scratch *scratch, + const struct som_operation *ri, const u64a to_offset) { + assert(scratch); + assert(ri); + DEBUG_PRINTF("-->som action required at %llu\n", to_offset); + + // SOM handling at scan time operates on data held in scratch. In + // streaming mode, this data is read from / written out to stream state at + // stream write boundaries. 
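+    // Writes to the same SOM slot at the same to_offset are tracked via the
+    // som_set_now fatbit below and resolved in favour of the earliest start
+    // offset.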
+ + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *rose = ci->rose; + assert(rose->hasSom); + + const u32 som_store_count = rose->somLocationCount; + u8 *som_store_valid = (u8 *)ci->state + rose->stateOffsets.somValid; + u8 *som_store_writable = (u8 *)ci->state + rose->stateOffsets.somWritable; + struct fatbit *som_set_now = scratch->som_set_now; + struct fatbit *som_attempted_set = scratch->som_attempted_set; + u64a *som_store = scratch->som_store; + u64a *som_failed_store = scratch->som_attempted_store; + + if (to_offset != scratch->som_set_now_offset) { + assert(scratch->som_set_now_offset == ~0ULL + || to_offset > scratch->som_set_now_offset); + DEBUG_PRINTF("setting som_set_now_offset=%llu\n", to_offset); + fatbit_clear(som_set_now); + fatbit_clear(som_attempted_set); + scratch->som_set_now_offset = to_offset; + } + + switch (ri->type) { + case SOM_INTERNAL_LOC_SET: + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET\n"); + mmbit_set(som_store_valid, som_store_count, ri->onmatch); + setSomLoc(som_set_now, som_store, som_store_count, ri, to_offset); + return; + case SOM_INTERNAL_LOC_SET_IF_UNSET: + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET_IF_UNSET\n"); + if (ok_and_mark_if_unset(som_store_valid, som_set_now, som_store_count, + ri->onmatch)) { + setSomLoc(som_set_now, som_store, som_store_count, ri, to_offset); + } + return; + case SOM_INTERNAL_LOC_SET_IF_WRITABLE: { + u32 slot = ri->onmatch; + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET_IF_WRITABLE\n"); + if (ok_and_mark_if_write(som_store_valid, som_set_now, + som_store_writable, som_store_count, slot)) { + setSomLoc(som_set_now, som_store, som_store_count, ri, to_offset); + mmbit_unset(som_store_writable, som_store_count, slot); + } else { + /* not writable, stash as an attempted write in case we are + * racing our escape. */ + DEBUG_PRINTF("not writable, stashing attempt\n"); + assert(to_offset >= ri->aux.somDistance); + u64a start_offset = to_offset - ri->aux.somDistance; + + if (!fatbit_set(som_attempted_set, som_store_count, slot)) { + som_failed_store[slot] = start_offset; + } else { + LIMIT_TO_AT_MOST(&som_failed_store[slot], start_offset); + } + DEBUG_PRINTF("som_failed_store[%u] = %llu\n", slot, + som_failed_store[slot]); + } + return; + } + case SOM_INTERNAL_LOC_SET_REV_NFA: + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET_REV_NFA\n"); + mmbit_set(som_store_valid, som_store_count, ri->onmatch); + setSomLocRevNfa(scratch, som_set_now, som_store, som_store_count, ri, + to_offset); + return; + case SOM_INTERNAL_LOC_SET_REV_NFA_IF_UNSET: + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET_REV_NFA_IF_UNSET\n"); + if (ok_and_mark_if_unset(som_store_valid, som_set_now, som_store_count, + ri->onmatch)) { + setSomLocRevNfa(scratch, som_set_now, som_store, som_store_count, + ri, to_offset); + } + return; + case SOM_INTERNAL_LOC_SET_REV_NFA_IF_WRITABLE: { + u32 slot = ri->onmatch; + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET_IF_WRITABLE\n"); + if (ok_and_mark_if_write(som_store_valid, som_set_now, + som_store_writable, som_store_count, slot)) { + setSomLocRevNfa(scratch, som_set_now, som_store, som_store_count, + ri, to_offset); + mmbit_unset(som_store_writable, som_store_count, slot); + } else { + /* not writable, stash as an attempted write in case we are + * racing our escape. 
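+             * If the escape does arrive, SOM_INTERNAL_LOC_MAKE_WRITABLE
+             * promotes the stashed value from som_failed_store into the
+             * real SOM store.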
*/ + DEBUG_PRINTF("not writable, stashing attempt\n"); + + u64a from_offset = 0; + runRevNfa(scratch, ri, to_offset, &from_offset); + + if (!fatbit_set(som_attempted_set, som_store_count, slot)) { + som_failed_store[slot] = from_offset; + } else { + LIMIT_TO_AT_MOST(&som_failed_store[slot], from_offset); + } + DEBUG_PRINTF("som_failed_store[%u] = %llu\n", slot, + som_failed_store[slot]); + } + return; + } + case SOM_INTERNAL_LOC_COPY: { + u32 slot_in = ri->aux.somDistance; + u32 slot_out = ri->onmatch; + DEBUG_PRINTF("SOM_INTERNAL_LOC_COPY S[%u] = S[%u]\n", slot_out, + slot_in); + assert(mmbit_isset(som_store_valid, som_store_count, slot_in)); + mmbit_set(som_store_valid, som_store_count, slot_out); + fatbit_set(som_set_now, som_store_count, slot_out); + som_store[slot_out] = som_store[slot_in]; + + return; + } + case SOM_INTERNAL_LOC_COPY_IF_WRITABLE: { + u32 slot_in = ri->aux.somDistance; + u32 slot_out = ri->onmatch; + DEBUG_PRINTF("SOM_INTERNAL_LOC_COPY_IF_WRITABLE S[%u] = S[%u]\n", + slot_out, slot_in); + assert(mmbit_isset(som_store_valid, som_store_count, slot_in)); + if (ok_and_mark_if_write(som_store_valid, som_set_now, + som_store_writable, som_store_count, + slot_out)) { + DEBUG_PRINTF("copy, set som_store[%u]=%llu\n", slot_out, + som_store[slot_in]); + som_store[slot_out] = som_store[slot_in]; + fatbit_set(som_set_now, som_store_count, slot_out); + mmbit_unset(som_store_writable, som_store_count, slot_out); + } else { + /* not writable, stash as an attempted write in case we are + * racing our escape */ + DEBUG_PRINTF("not writable, stashing attempt\n"); + fatbit_set(som_attempted_set, som_store_count, slot_out); + som_failed_store[slot_out] = som_store[slot_in]; + DEBUG_PRINTF("som_failed_store[%u] = %llu\n", slot_out, + som_failed_store[slot_out]); + } + return; + } + case SOM_INTERNAL_LOC_MAKE_WRITABLE: { + u32 slot = ri->onmatch; + DEBUG_PRINTF("SOM_INTERNAL_LOC_MAKE_WRITABLE\n"); + /* if just written to the loc, ignore the racing escape */ + if (fatbit_isset(som_set_now, som_store_count, slot)) { + DEBUG_PRINTF("just written\n"); + return; + } + if (fatbit_isset(som_attempted_set, som_store_count, slot)) { + /* writes were waiting for an escape to arrive */ + DEBUG_PRINTF("setting som_store[%u] = %llu from " + "som_failed_store[%u]\n", slot, som_failed_store[slot], + slot); + som_store[slot] = som_failed_store[slot]; + fatbit_set(som_set_now, som_store_count, slot); + return; + } + mmbit_set(som_store_writable, som_store_count, slot); + return; + } + default: + DEBUG_PRINTF("unknown report type!\n"); + break; + } + + // All valid som_operation types should be handled and returned above. + assert(0); + return; +} + +// Returns the SOM offset. +u64a handleSomExternal(struct hs_scratch *scratch, + const struct som_operation *ri, + const u64a to_offset) { + assert(scratch); + assert(ri); + + // SOM handling at scan time operates on data held in scratch. In + // streaming mode, this data is read from / written out to stream state at + // stream write boundaries. 
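+    // Unlike handleSomInternal, nothing is written back to the SOM slots
+    // here: the operation only yields the from_offset reported to the user
+    // callback.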
+ + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *rose = ci->rose; + assert(rose->hasSom); + + switch (ri->type) { + case SOM_EXTERNAL_CALLBACK_REL: + DEBUG_PRINTF("SOM_EXTERNAL_CALLBACK_REL: som is %llu chars back\n", + ri->aux.somDistance); + assert(to_offset >= ri->aux.somDistance); + return to_offset - ri->aux.somDistance; + case SOM_EXTERNAL_CALLBACK_ABS: + DEBUG_PRINTF("SOM_EXTERNAL_CALLBACK_ABS: som is at %llu\n", + ri->aux.somDistance); + assert(to_offset >= ri->aux.somDistance); + return ri->aux.somDistance; + case SOM_EXTERNAL_CALLBACK_STORED: { + const u64a *som_store = scratch->som_store; + u32 slot = ri->aux.somDistance; + DEBUG_PRINTF("SOM_EXTERNAL_CALLBACK_STORED: <- som_store[%u]=%llu\n", + slot, som_store[slot]); + + UNUSED const u32 som_store_count = rose->somLocationCount; + UNUSED const u8 *som_store_valid = (u8 *)ci->state + + rose->stateOffsets.somValid; + + assert(mmbit_isset(som_store_valid, som_store_count, slot)); + return som_store[slot]; + } + case SOM_EXTERNAL_CALLBACK_REV_NFA: { + DEBUG_PRINTF("SOM_EXTERNAL_CALLBACK_REV_NFA\n"); + u64a from_offset = 0; + runRevNfa(scratch, ri, to_offset, &from_offset); + return from_offset; + } + default: + DEBUG_PRINTF("unknown report type!\n"); + break; + } + + // All valid som_operation types should be handled and returned above. + assert(0); + return 0; +} + +void setSomFromSomAware(struct hs_scratch *scratch, + const struct som_operation *ri, u64a from_offset, + u64a to_offset) { + assert(scratch); + assert(ri); + assert(to_offset); + assert(ri->type == SOM_INTERNAL_LOC_SET_FROM + || ri->type == SOM_INTERNAL_LOC_SET_FROM_IF_WRITABLE); + + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *rose = ci->rose; + assert(rose->hasSom); + + const u32 som_store_count = rose->somLocationCount; + u8 *som_store_valid = (u8 *)ci->state + rose->stateOffsets.somValid; + u8 *som_store_writable = (u8 *)ci->state + rose->stateOffsets.somWritable; + struct fatbit *som_set_now = scratch->som_set_now; + struct fatbit *som_attempted_set = scratch->som_attempted_set; + u64a *som_store = scratch->som_store; + u64a *som_failed_store = scratch->som_attempted_store; + + if (to_offset != scratch->som_set_now_offset) { + DEBUG_PRINTF("setting som_set_now_offset=%llu\n", to_offset); + fatbit_clear(som_set_now); + fatbit_clear(som_attempted_set); + scratch->som_set_now_offset = to_offset; + } + + if (ri->type == SOM_INTERNAL_LOC_SET_FROM) { + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET_FROM\n"); + mmbit_set(som_store_valid, som_store_count, ri->onmatch); + setSomLoc(som_set_now, som_store, som_store_count, ri, from_offset); + } else { + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET_FROM_IF_WRITABLE\n"); + if (ok_and_mark_if_write(som_store_valid, som_set_now, + som_store_writable, som_store_count, + ri->onmatch)) { + setSomLoc(som_set_now, som_store, som_store_count, ri, from_offset); + mmbit_unset(som_store_writable, som_store_count, ri->onmatch); + } else { + /* not writable, stash as an attempted write in case we are + * racing our escape. 
*/ + DEBUG_PRINTF("not writable, stashing attempt\n"); + assert(to_offset >= ri->aux.somDistance); + u32 som_loc = ri->onmatch; + + if (!fatbit_set(som_attempted_set, som_store_count, ri->onmatch)) { + som_failed_store[som_loc] = from_offset; + } else { + LIMIT_TO_AT_MOST(&som_failed_store[som_loc], from_offset); + } + DEBUG_PRINTF("som_failed_store[%u] = %llu\n", som_loc, + som_failed_store[som_loc]); + } + } +} + +static really_inline +int clearSomLog(struct hs_scratch *scratch, u64a offset, struct fatbit *log, + const u64a *starts) { + DEBUG_PRINTF("at %llu\n", offset); + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *rose = ci->rose; + const u32 dkeyCount = rose->dkeyCount; + const u32 *dkey_to_report = (const u32 *) + ((const char *)rose + rose->invDkeyOffset); + u32 flags = 0; +#ifndef RELEASE_BUILD + if (scratch->deduper.current_report_offset != offset) { + flags |= HS_MATCH_FLAG_ADJUSTED; + } +#endif + + for (u32 it = fatbit_iterate(log, dkeyCount, MMB_INVALID); + it != MMB_INVALID; it = fatbit_iterate(log, dkeyCount, it)) { + u64a from_offset = starts[it]; + u32 onmatch = dkey_to_report[it]; + int halt = ci->userCallback(onmatch, from_offset, offset, flags, + ci->userContext); + if (halt) { + ci->status |= STATUS_TERMINATED; + return 1; + } + } + fatbit_clear(log); + return 0; +} + +int flushStoredSomMatches_i(struct hs_scratch *scratch, u64a offset) { + DEBUG_PRINTF("flush som matches\n"); + int halt = 0; + + assert(!told_to_stop_matching(scratch)); + + if (scratch->deduper.current_report_offset == ~0ULL) { + /* no matches recorded yet; just need to clear the logs */ + fatbit_clear(scratch->deduper.som_log[0]); + fatbit_clear(scratch->deduper.som_log[1]); + scratch->deduper.som_log_dirty = 0; + return 0; + } + + /* fire any reports from the logs and clear them */ + if (offset == scratch->deduper.current_report_offset + 1) { + struct fatbit *done_log = scratch->deduper.som_log[offset % 2]; + u64a *done_starts = scratch->deduper.som_start_log[offset % 2]; + + halt = clearSomLog(scratch, scratch->deduper.current_report_offset - 1, + done_log, done_starts); + scratch->deduper.som_log_dirty >>= 1; + } else { + /* need to report both logs */ + u64a f_offset = scratch->deduper.current_report_offset - 1; + u64a s_offset = scratch->deduper.current_report_offset; + struct fatbit *first_log = scratch->deduper.som_log[f_offset % 2]; + u64a *first_starts = scratch->deduper.som_start_log[f_offset % 2]; + struct fatbit *second_log = scratch->deduper.som_log[s_offset % 2]; + u64a *second_starts = scratch->deduper.som_start_log[s_offset % 2]; + + halt = clearSomLog(scratch, f_offset, first_log, first_starts) || + clearSomLog(scratch, s_offset, second_log, second_starts); + scratch->deduper.som_log_dirty = 0; + } + + return halt; +} diff --git a/regex/som/som_runtime.h b/regex/som/som_runtime.h new file mode 100644 index 000000000..30c7ace8c --- /dev/null +++ b/regex/som/som_runtime.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief SOM runtime code. + * + * Runtime code for SOM handling called by the Rose callback adaptors. + */ + +#ifndef SOM_RUNTIME_H +#define SOM_RUNTIME_H + +#include "scratch.h" +#include "ue2common.h" + +struct som_operation; + +void handleSomInternal(struct hs_scratch *scratch, + const struct som_operation *ri, const u64a to_offset); + +// Returns the from_offset. +u64a handleSomExternal(struct hs_scratch *scratch, + const struct som_operation *ri, const u64a to_offset); + +void setSomFromSomAware(struct hs_scratch *scratch, + const struct som_operation *ri, u64a from_offset, + u64a to_offset); + +int flushStoredSomMatches_i(struct hs_scratch *scratch, u64a offset); + +static really_inline +int flushStoredSomMatches(struct hs_scratch *scratch, u64a offset) { + if (scratch->deduper.som_log_dirty) { + return flushStoredSomMatches_i(scratch, offset); + } else { + return 0; + } +} + +#endif // SOM_RUNTIME_H + diff --git a/regex/som/som_stream.c b/regex/som/som_stream.c new file mode 100644 index 000000000..93ab709ed --- /dev/null +++ b/regex/som/som_stream.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SOM streaming runtime code. + * + * Code in this file handles storing and loading SOM slot information from + * stream state. + */ + +#include "scratch.h" +#include "som_stream.h" +#include "rose/rose_internal.h" +#include "util/multibit.h" + +// Sentinel values stored in stream state and used to represent an SOM distance +// that is too far in the past to be stored in the available space in stream +// state. + +#define SOM_SENTINEL_LARGE (~0ull) +#define SOM_SENTINEL_MEDIUM (~0u) +#define SOM_SENTINEL_SMALL ((u16)~0u) + +static really_inline +void storeSomValue(void *stream_som_store, u64a som_value, + u64a stream_offset, u8 som_size) { + // Special case for sentinel value. + if (som_value == SOM_SENTINEL_LARGE) { + switch (som_size) { + case 2: + *(u16 *)stream_som_store = SOM_SENTINEL_SMALL; + break; + case 4: + *(u32 *)stream_som_store = SOM_SENTINEL_MEDIUM; + break; + case 8: + *(u64a *)stream_som_store = SOM_SENTINEL_LARGE; + break; + default: + break; + } + return; + } + + assert(som_value <= stream_offset); + u64a rel_offset = stream_offset - som_value; + DEBUG_PRINTF("rel_offset=%llu\n", rel_offset); + + switch (som_size) { + case 2: + rel_offset = MIN(rel_offset, SOM_SENTINEL_SMALL); + assert(ISALIGNED_N(stream_som_store, alignof(u16))); + *(u16 *)stream_som_store = rel_offset; + break; + case 4: + rel_offset = MIN(rel_offset, SOM_SENTINEL_MEDIUM); + assert(ISALIGNED_N(stream_som_store, alignof(u32))); + *(u32 *)stream_som_store = rel_offset; + break; + case 8: + assert(ISALIGNED_N(stream_som_store, alignof(u64a))); + *(u64a *)stream_som_store = rel_offset; + break; + default: + assert(0); + break; + } +} + +void storeSomToStream(struct hs_scratch *scratch, const u64a offset) { + assert(scratch); + DEBUG_PRINTF("stream offset %llu\n", offset); + + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *rose = ci->rose; + + const u32 som_store_count = rose->somLocationCount; + assert(som_store_count); // Caller should ensure that we have work to do. 
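+    // Only slots marked valid in the som_store_valid multibit are written
+    // out; each value is stored as a distance from the current stream
+    // offset in rose->somHorizon bytes, saturating to a sentinel value when
+    // it does not fit.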
+ + u8 *som_store_valid = (u8 *)ci->state + rose->stateOffsets.somValid; + char *stream_som_store = ci->state + rose->stateOffsets.somLocation; + const u64a *som_store = scratch->som_store; + const u8 som_size = rose->somHorizon; + + for (u32 i = mmbit_iterate(som_store_valid, som_store_count, MMB_INVALID); + i != MMB_INVALID; + i = mmbit_iterate(som_store_valid, som_store_count, i)) { + DEBUG_PRINTF("storing %llu in %u\n", som_store[i], i); + storeSomValue(stream_som_store + (i * som_size), som_store[i], + offset, som_size); + } +} + +static really_inline +u64a loadSomValue(const void *stream_som_store, u64a stream_offset, + u8 som_size) { + u64a rel_offset; + switch (som_size) { + case 2: + assert(ISALIGNED_N(stream_som_store, alignof(u16))); + rel_offset = *(const u16 *)stream_som_store; + if (rel_offset == SOM_SENTINEL_SMALL) { + return SOM_SENTINEL_LARGE; + } + break; + case 4: + assert(ISALIGNED_N(stream_som_store, alignof(u32))); + rel_offset = *(const u32 *)stream_som_store; + if (rel_offset == SOM_SENTINEL_MEDIUM) { + return SOM_SENTINEL_LARGE; + } + break; + case 8: + assert(ISALIGNED_N(stream_som_store, alignof(u64a))); + rel_offset = *(const u64a *)stream_som_store; + break; + default: + assert(0); + rel_offset = 0; + break; + } + + DEBUG_PRINTF("rel_offset=%llu\n", rel_offset); + return stream_offset - rel_offset; +} + +void loadSomFromStream(struct hs_scratch *scratch, const u64a offset) { + assert(scratch); + DEBUG_PRINTF("stream offset %llu\n", offset); + + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *rose = ci->rose; + + const u32 som_store_count = rose->somLocationCount; + assert(som_store_count); // Caller should ensure that we have work to do. + + const u8 *som_store_valid = (u8 *)ci->state + rose->stateOffsets.somValid; + const char *stream_som_store = ci->state + rose->stateOffsets.somLocation; + u64a *som_store = scratch->som_store; + const u8 som_size = rose->somHorizon; + + for (u32 i = mmbit_iterate(som_store_valid, som_store_count, MMB_INVALID); + i != MMB_INVALID; + i = mmbit_iterate(som_store_valid, som_store_count, i)) { + som_store[i] = loadSomValue(stream_som_store + (i*som_size), offset, + som_size); + DEBUG_PRINTF("loaded %llu from %u\n", som_store[i], i); + } +} diff --git a/regex/som/som_stream.h b/regex/som/som_stream.h new file mode 100644 index 000000000..8b62264d1 --- /dev/null +++ b/regex/som/som_stream.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SOM streaming runtime code. + */ + +#ifndef SOM_STREAM_H +#define SOM_STREAM_H + +#include "ue2common.h" + +struct hs_scratch; + +/** \brief Write all SOM slot information from scratch out to stream state + * (given the current stream offset). */ +void storeSomToStream(struct hs_scratch *scratch, const u64a offset); + +/** \brief Read all SOM slot information from stream state into scratch (given + * the current stream offset). */ +void loadSomFromStream(struct hs_scratch *scratch, const u64a offset); + +#endif diff --git a/regex/state.h b/regex/state.h new file mode 100644 index 000000000..9ade59db4 --- /dev/null +++ b/regex/state.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Stream state data structures. + */ + +#ifndef STATE_H +#define STATE_H + +#include "hs_runtime.h" /* match_event_handler */ +#include "ue2common.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +struct RoseEngine; + +/** \brief Stream context: allocated for each stream. + * + * struct hs_stream is followed in memory by the main Rose state: history, + * exhaustion, individual states, etc. The RoseEngine has the offsets required + * to correctly index into the main state structure. The offsets used by the + * RoseEngine are based on the end of the hs_stream struct as its size may + * vary from platform to platform. + */ +struct hs_stream { + /** \brief The RoseEngine that this stream is matching against. 
*/ + const struct RoseEngine *rose; + + /** \brief The current stream offset. */ + u64a offset; +}; + +#define getMultiState(hs_s) ((char *)(hs_s) + sizeof(*(hs_s))) +#define getMultiStateConst(hs_s) ((const char *)(hs_s) + sizeof(*(hs_s))) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/stream_compress.c b/regex/stream_compress.c new file mode 100644 index 000000000..1f7b01e82 --- /dev/null +++ b/regex/stream_compress.c @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "stream_compress.h" + +#include "state.h" +#include "nfa/nfa_internal.h" +#include "rose/rose_internal.h" +#include "util/multibit.h" +#include "util/multibit_compress.h" +#include "util/uniform_ops.h" + +#ifndef __KERNEL__ +#include +#else +#include +#endif + +#define COPY_IN(p, sz) do { \ + assert(currOffset + sz <= buf_size); \ + memcpy(buf + currOffset, p, sz); \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY_OUT(p, sz) do { \ + if (currOffset + sz > buf_size) { \ + return 0; \ + } \ + memcpy(p, buf + currOffset, sz); \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define SIZE_COPY_IN(p, sz) do { \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY_MULTIBIT_IN(p, total_bits) do { \ + size_t sz; \ + STREAM_QUAL u8 *bits = (STREAM_QUAL u8 *)p; \ + BUF_QUAL u8 *comp = (BUF_QUAL u8 *)(buf + currOffset); \ + if (!mmbit_compress(bits, total_bits, comp, &sz, \ + buf_size - currOffset)) { \ + return 0; /* error */ \ + } \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY_MULTIBIT_OUT(p, total_bits) do { \ + size_t sz; \ + STREAM_QUAL u8 *bits = (STREAM_QUAL u8 *)p; \ + BUF_QUAL u8 *comp = (BUF_QUAL u8 *)(buf + currOffset); \ + if (!mmbit_decompress(bits, total_bits, comp, &sz, \ + buf_size - currOffset)) { \ + return 0; /* error */ \ + } \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY_MULTIBIT_SIZE(p, total_bits) do { \ + STREAM_QUAL u8 *bits = (STREAM_QUAL u8 *)p; \ + size_t sz = mmbit_compsize(bits, total_bits); \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY COPY_OUT +#define COPY_MULTIBIT COPY_MULTIBIT_OUT +#define ASSIGN(lhs, rhs) do { lhs = rhs; } while (0) +#define FN_SUFFIX expand +#define STREAM_QUAL +#define BUF_QUAL const +#include "stream_compress_impl.h" + +int expand_stream(struct hs_stream *stream, const struct RoseEngine *rose, + const char *buf, size_t buf_size) { + return sc_expand(rose, stream, buf, buf_size); +} + +#define COPY COPY_IN +#define COPY_MULTIBIT COPY_MULTIBIT_IN +#define ASSIGN(lhs, rhs) do { } while (0) +#define FN_SUFFIX compress +#define STREAM_QUAL const +#define BUF_QUAL +#include "stream_compress_impl.h" + +size_t compress_stream(char *buf, size_t buf_size, + const struct RoseEngine *rose, + const struct hs_stream *stream) { + return sc_compress(rose, stream, buf, buf_size); +} + +#define COPY SIZE_COPY_IN +#define COPY_MULTIBIT COPY_MULTIBIT_SIZE +#define ASSIGN(lhs, rhs) do { } while (0) +#define FN_SUFFIX size +#define STREAM_QUAL const +#define BUF_QUAL UNUSED +#include "stream_compress_impl.h" + +size_t size_compress_stream(const struct RoseEngine *rose, + const struct hs_stream *stream) { + return sc_size(rose, stream, NULL, 0); +} diff --git a/regex/stream_compress.h b/regex/stream_compress.h new file mode 100644 index 000000000..fb2e5cade --- /dev/null +++ b/regex/stream_compress.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Functions for dynamic compress/expand for streams. + */ + +#ifndef STREAM_COMPRESS_H +#define STREAM_COMPRESS_H + +#ifndef __KERNEL__ +#include +#else +#include +#endif + +struct hs_stream; +struct RoseEngine; + +int expand_stream(struct hs_stream *out, const struct RoseEngine *rose, + const char *buf, size_t buf_size); + +size_t compress_stream(char *buf, size_t buf_size, + const struct RoseEngine *rose, + const struct hs_stream *src); + +size_t size_compress_stream(const struct RoseEngine *rose, + const struct hs_stream *stream); + +#endif diff --git a/regex/stream_compress_impl.h b/regex/stream_compress_impl.h new file mode 100644 index 000000000..d1ccf5e6d --- /dev/null +++ b/regex/stream_compress_impl.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2017-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "util/join.h" + +#define COPY_FIELD(x) COPY(&x, sizeof(x)) +#define COPY_LEFTFIXES JOIN(sc_left_, FN_SUFFIX) +#define COPY_SOM_INFO JOIN(sc_som_, FN_SUFFIX) + +static +size_t COPY_LEFTFIXES(const struct RoseEngine *rose, size_t currOffset, + STREAM_QUAL struct hs_stream *stream, + BUF_QUAL char *buf, UNUSED size_t buf_size) { + if (!rose->activeLeftIterOffset) { + return currOffset; + } + + const struct RoseStateOffsets *so = &rose->stateOffsets; + STREAM_QUAL char *stream_body + = ((STREAM_QUAL char *)stream) + sizeof(struct hs_stream); + + /* Note: in the expand case the active left array has already been copied + * into the stream. */ + const u8 *ara = (const u8 *)(stream_body + so->activeLeftArray); + const u32 arCount = rose->activeLeftCount; + const struct LeftNfaInfo *left_table = getLeftTable(rose); + + /* We only want to look at non-transient leftfixes */ + const struct mmbit_sparse_iter *it = getActiveLeftIter(rose); + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + u32 dummy; + u32 ri = mmbit_sparse_iter_begin(ara, arCount, &dummy, it, si_state); + for (; ri != MMB_INVALID; + ri = mmbit_sparse_iter_next(ara, arCount, ri, &dummy, it, si_state)) { + u32 qi = ri + rose->leftfixBeginQueue; + UNUSED const struct LeftNfaInfo *left = left_table + ri; + const struct NfaInfo *nfa_info = getNfaInfoByQueue(rose, qi); + const struct NFA *nfa = getNfaByInfo(rose, nfa_info); + + COPY(stream_body + nfa_info->stateOffset, nfa->streamStateSize); + /* copy the one whole byte for active leftfixes as well */ + assert(left->lagIndex != ROSE_OFFSET_INVALID); + COPY(stream_body + so->leftfixLagTable + left->lagIndex, 1); + } + + return currOffset; +} + +static +size_t COPY_SOM_INFO(const struct RoseEngine *rose, size_t currOffset, + STREAM_QUAL struct hs_stream *stream, + BUF_QUAL char *buf, UNUSED size_t buf_size) { + const struct RoseStateOffsets *so = &rose->stateOffsets; + + if (!so->somLocation) { + assert(!so->somValid); + assert(!so->somWritable); + return currOffset; + } + + STREAM_QUAL char *stream_body + = ((STREAM_QUAL char *)stream) + sizeof(struct hs_stream); + + assert(so->somValid); + assert(so->somWritable); + + COPY_MULTIBIT(stream_body + so->somWritable, rose->somLocationCount); + COPY_MULTIBIT(stream_body + so->somValid, rose->somLocationCount); + + /* Copy only the som slots which contain valid values. */ + /* Note: in the expand case the som valid array has been copied in. 
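+     * (via the COPY_MULTIBIT above), so iterating it below yields the same
+     * set of valid slots on both the compress and expand paths.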
*/ + const u8 *svalid = (const u8 *)(stream_body + so->somValid); + u32 s_count = rose->somLocationCount; + u32 s_width = rose->somHorizon; + for (u32 slot = mmbit_iterate(svalid, s_count, MMB_INVALID); + slot != MMB_INVALID; slot = mmbit_iterate(svalid, s_count, slot)) { + COPY(stream_body + so->somLocation + slot * s_width, s_width); + } + + return currOffset; +} + +static +size_t JOIN(sc_, FN_SUFFIX)(const struct RoseEngine *rose, + STREAM_QUAL struct hs_stream *stream, + BUF_QUAL char *buf, UNUSED size_t buf_size) { + size_t currOffset = 0; + const struct RoseStateOffsets *so = &rose->stateOffsets; + + STREAM_QUAL char *stream_body + = ((STREAM_QUAL char *)stream) + sizeof(struct hs_stream); + + COPY_FIELD(stream->offset); + ASSIGN(stream->rose, rose); + + COPY(stream_body + ROSE_STATE_OFFSET_STATUS_FLAGS, 1); + COPY_MULTIBIT(stream_body + ROSE_STATE_OFFSET_ROLE_MMBIT, rose->rolesWithStateCount); + + /* stream is valid in compress/size, and stream->offset has been set already + * on the expand side */ + u64a offset = stream->offset; + u32 history = MIN((u32)offset, rose->historyRequired); + + /* copy the active mmbits */ + COPY_MULTIBIT(stream_body + so->activeLeafArray, rose->activeArrayCount); + COPY_MULTIBIT(stream_body + so->activeLeftArray, rose->activeLeftCount); + + COPY(stream_body + so->longLitState, so->longLitState_size); + + /* Leftlag table will be handled later, for active leftfixes */ + + /* anchored table state is not required once we are deep in the stream */ + if (offset <= rose->anchoredDistance) { + COPY(stream_body + so->anchorState, rose->anchorStateSize); + } + + COPY(stream_body + so->groups, so->groups_size); + + /* copy the real bits of history */ + UNUSED u32 hend = so->history + rose->historyRequired; + COPY(stream_body + hend - history, history); + + /* copy the exhaustion multibit */ + COPY_MULTIBIT(stream_body + so->exhausted, rose->ekeyCount); + + /* copy the logical multibit */ + COPY_MULTIBIT(stream_body + so->logicalVec, + rose->lkeyCount + rose->lopCount); + + /* copy the combination multibit */ + COPY_MULTIBIT(stream_body + so->combVec, rose->ckeyCount); + + /* copy nfa stream state for endfixes */ + /* Note: in the expand case the active array has already been copied into + * the stream. 
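+     * Only queues marked live in that multibit have stream state to copy.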
*/ + const u8 *aa = (const u8 *)(stream_body + so->activeLeafArray); + u32 aaCount = rose->activeArrayCount; + for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; + qi = mmbit_iterate(aa, aaCount, qi)) { + DEBUG_PRINTF("saving stream state for qi=%u\n", qi); + const struct NfaInfo *nfa_info = getNfaInfoByQueue(rose, qi); + const struct NFA *nfa = getNfaByInfo(rose, nfa_info); + COPY(stream_body + nfa_info->stateOffset, nfa->streamStateSize); + } + + /* copy nfa stream state for leftfixes */ + currOffset = COPY_LEFTFIXES(rose, currOffset, stream, buf, buf_size); + if (!currOffset) { + return 0; + } + + currOffset = COPY_SOM_INFO(rose, currOffset, stream, buf, buf_size); + if (!currOffset) { + return 0; + } + + return currOffset; +} + +#undef ASSIGN +#undef COPY +#undef COPY_FIELD +#undef COPT_LEFTFIXES +#undef COPY_MULTIBIT +#undef COPY_SOM_INFO +#undef FN_SUFFIX +#undef BUF_QUAL +#undef STREAM_QUAL diff --git a/regex/ue2common.h b/regex/ue2common.h new file mode 100644 index 000000000..7b471c8ee --- /dev/null +++ b/regex/ue2common.h @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Core UE2 global types, defines, utilities. + * + * NOTE WELL: this file is included into both C and C++ source code, so + * be sure to remain compatible with both. + */ + +#ifndef UE2COMMON_H +#define UE2COMMON_H + +#include "config.h" + +#ifndef __KERNEL__ + +/* standard types used across ue2 */ + +// We use the size_t type all over the place, usually defined in stddef.h. 
+#include +// stdint.h for things like uintptr_t and friends +#include + +#if defined(__cplusplus) +# define FALLTHROUGH [[fallthrough]] +#elif !defined(_WIN32) && __has_attribute(__fallthrough__) +# define FALLTHROUGH __attribute__((__fallthrough__)) +#else +# define FALLTHROUGH do {} while (0) /* fallthrough */ +#endif + +/* ick */ +#if defined(_WIN32) +#define ALIGN_ATTR(x) __declspec(align(x)) +#else +#define ALIGN_ATTR(x) __attribute__((aligned((x)))) +#endif + +#define ALIGN_DIRECTIVE ALIGN_ATTR(16) +#define ALIGN_AVX_DIRECTIVE ALIGN_ATTR(32) +#define ALIGN_CL_DIRECTIVE ALIGN_ATTR(64) + +typedef signed char s8; +typedef unsigned char u8; +typedef signed short s16; +typedef unsigned short u16; +typedef unsigned int u32; +typedef signed int s32; + +/* We append the 'a' for aligned, since these aren't common, garden variety + * 64 bit values. The alignment is necessary for structs on some platforms, + * so we don't end up performing accidental unaligned accesses. */ +#if defined(_WIN32) && ! defined(_WIN64) +typedef unsigned long long ALIGN_ATTR(4) u64a; +typedef signed long long ALIGN_ATTR(4) s64a; +#else +typedef unsigned long long ALIGN_ATTR(8) u64a; +typedef signed long long ALIGN_ATTR(8) s64a; +#endif + +/* get the SIMD types */ +#include "util/simd_types.h" + +/** \brief Report identifier, used for internal IDs and external IDs (those + * reported on match). */ +typedef u32 ReportID; + +/* Shorthand for attribute to mark a function as part of our public API. + * Functions without this attribute will be hidden. */ +#if !defined(_WIN32) +#define HS_PUBLIC_API __attribute__((visibility("default"))) +#else +// TODO: dllexport defines for windows +#define HS_PUBLIC_API +#endif + +#define ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0])) + +/** \brief Shorthand for the attribute to shut gcc about unused parameters */ +#if !defined(_WIN32) +#define UNUSED __attribute__ ((unused)) +#else +#define UNUSED +#endif + +/* really_inline forces inlining always */ +#if !defined(_WIN32) +#if defined(HS_OPTIMIZE) +#define really_inline inline __attribute__ ((always_inline, unused)) +#else +#define really_inline __attribute__ ((unused)) +#endif + +/** no, seriously, inline it, even if building in debug mode */ +#define really_really_inline inline __attribute__ ((always_inline, unused)) +#define never_inline __attribute__ ((noinline)) +#define alignof __alignof +#define HAVE_TYPEOF 1 + +#else // ms windows +#define really_inline __forceinline +#define really_really_inline __forceinline +#define never_inline +#define __builtin_prefetch(...) do {} while(0) +#if defined(__cplusplus) +#define __typeof__ decltype +#define HAVE_TYPEOF 1 +#else // C +/* msvc doesn't have decltype or typeof in C */ +#define inline __inline +#define alignof __alignof +#endif +#endif + + +// We use C99-style "restrict". 
+#ifdef _WIN32 +#ifdef __cplusplus +#define restrict +#else +#define restrict __restrict +#endif +#else +#define restrict __restrict +#endif + + +// Align to 16-byte boundary +#define ROUNDUP_16(a) (((a) + 0xf) & ~0xf) +#define ROUNDDOWN_16(a) ((a) & ~0xf) + +// Align to N-byte boundary +#define ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1)) +#define ROUNDDOWN_N(a, n) ((a) & ~((n)-1)) + +// Align to a cacheline - assumed to be 64 bytes +#define ROUNDUP_CL(a) ROUNDUP_N(a, 64) + +// Align ptr to next N-byte boundary +#if defined(HAVE_TYPEOF) +#define ROUNDUP_PTR(ptr, n) (__typeof__(ptr))(ROUNDUP_N((uintptr_t)(ptr), (n))) +#define ROUNDDOWN_PTR(ptr, n) (__typeof__(ptr))(ROUNDDOWN_N((uintptr_t)(ptr), (n))) +#else +#define ROUNDUP_PTR(ptr, n) (void*)(ROUNDUP_N((uintptr_t)(ptr), (n))) +#define ROUNDDOWN_PTR(ptr, n) (void*)(ROUNDDOWN_N((uintptr_t)(ptr), (n))) +#endif + +#define ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n) - 1)) == 0) +#define ISALIGNED_16(ptr) ISALIGNED_N((ptr), 16) +#define ISALIGNED_CL(ptr) ISALIGNED_N((ptr), 64) +#if defined(HAVE_TYPEOF) +#define ISALIGNED(ptr) ISALIGNED_N((ptr), alignof(__typeof__(*(ptr)))) +#else +/* we should probably avoid using this test in C */ +#define ISALIGNED(ptr) (1) +#endif +#define N_CHARS 256 + +// Maximum offset representable in the 'unsigned long long' we use to return +// offset values. +#define MAX_OFFSET 0xffffffffffffffffULL + +#if !defined(MIN) + #define MIN(a,b) ((a) < (b) ? (a) : (b)) +#endif +#if !defined(MAX) + #define MAX(a,b) ((a) > (b) ? (a) : (b)) +#endif + +#define LIMIT_TO_AT_MOST(a, b) (*(a) = MIN(*(a),(b))) +#define ENSURE_AT_LEAST(a, b) (*(a) = MAX(*(a),(b))) + +#ifndef _WIN32 +#ifndef likely + #define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely + #define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#else +#define likely(x) (x) +#define unlikely(x) (x) +#endif + +#if !defined(RELEASE_BUILD) || defined(DEBUG) +#ifdef _WIN32 +#define PATH_SEP '\\' +#else +#define PATH_SEP '/' +#endif +#endif + +#if defined(DEBUG) && !defined(DEBUG_PRINTF) +#include +#include +#define DEBUG_PRINTF(format, ...) printf("%s:%s:%d:" format, \ + strrchr(__FILE__, PATH_SEP) + 1, \ + __func__, __LINE__, ## __VA_ARGS__) +#elif !defined(DEBUG_PRINTF) +#define DEBUG_PRINTF(format, ...) pr_notice("%s:%s:%d:" format, \ + strrchr(__FILE__, PATH_SEP) + 1, \ + __func__, __LINE__, ## __VA_ARGS__) +#endif + +#if !defined(RELEASE_BUILD) +#include +#include +#define ADEBUG_PRINTF(format, ...) printf("!%s:%s:%d:" format, \ + strrchr(__FILE__, PATH_SEP) + 1, \ + __func__, __LINE__, ## __VA_ARGS__) +#else +#define ADEBUG_PRINTF(format, ...) do { } while(0) +#endif + +#include + +#else +#include "ue2common_kern.h" +#endif + +#endif diff --git a/regex/util/arch.h b/regex/util/arch.h new file mode 100644 index 000000000..782ad5b2e --- /dev/null +++ b/regex/util/arch.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Per-platform architecture definitions + */ + +#ifndef UTIL_ARCH_H_ +#define UTIL_ARCH_H_ + +#if !defined(__KERNEL__) +#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2) +#define HAVE_SSE2 +#endif + +#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__)) +#define HAVE_SSE41 +#endif + +#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__)) +#define HAVE_SSE42 +#endif + +#if defined(__AVX__) +#define HAVE_AVX +#endif + +#if defined(__AVX2__) +#define HAVE_AVX2 +#endif + +#if defined(__AVX512BW__) && defined(BUILD_AVX512) +#define HAVE_AVX512 +#endif + +#if defined(__AVX512VBMI__) && defined(BUILD_AVX512_VBMI) +#define HAVE_AVX512VBMI +#endif +#endif /* __KERNEL__ */ + +/* + * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros + */ +#if defined(__POPCNT__) || \ + (defined(__INTEL_COMPILER) && defined(__SSE4_2__)) || \ + (defined(_WIN32) && defined(__AVX__)) +#define HAVE_POPCOUNT_INSTR +#endif + +#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) || \ + (defined(__INTEL_COMPILER) && defined(__AVX2__)) +#define HAVE_BMI +#endif + +#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \ + (defined(__INTEL_COMPILER) && defined(__AVX2__)) +#define HAVE_BMI2 +#endif + +/* + * MSVC uses a different form of inline asm + */ +#if defined(_WIN32) && defined(_MSC_VER) +#define NO_ASM +#endif + +#endif // UTIL_ARCH_H_ diff --git a/regex/util/bitutils.h b/regex/util/bitutils.h new file mode 100644 index 000000000..c545ee187 --- /dev/null +++ b/regex/util/bitutils.h @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_H +#define BITUTILS_H + +#include "ue2common.h" +#include "popcount.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#define CASE_BIT 0x20 +#define CASE_CLEAR 0xdf +#define DOUBLE_CASE_CLEAR 0xdfdf +#define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL + +static really_inline +u32 clz32(u32 x) { + assert(x); // behaviour not defined for x == 0 +#if defined(_WIN32) + unsigned long r; + _BitScanReverse(&r, x); + return 31 - r; +#else + return (u32)__builtin_clz(x); +#endif +} + +static really_inline +u32 clz64(u64a x) { + assert(x); // behaviour not defined for x == 0 +#if defined(_WIN64) + unsigned long r; + _BitScanReverse64(&r, x); + return 63 - r; +#elif defined(_WIN32) + unsigned long x1 = (u32)x; + unsigned long x2 = (u32)(x >> 32); + unsigned long r; + if (x2) { + _BitScanReverse(&r, x2); + return (u32)(31 - r); + } + _BitScanReverse(&r, (u32)x1); + return (u32)(63 - r); +#else + return (u32)__builtin_clzll(x); +#endif +} + +// CTZ (count trailing zero) implementations. 
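+// e.g. ctz32(0x8) == 3 and ctz64(0x100) == 8; callers must not pass zero.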
+static really_inline +u32 ctz32(u32 x) { + assert(x); // behaviour not defined for x == 0 +#if defined(_WIN32) + unsigned long r; + _BitScanForward(&r, x); + return r; +#else + return (u32)__builtin_ctz(x); +#endif +} + +static really_inline +u32 ctz64(u64a x) { + assert(x); // behaviour not defined for x == 0 +#if defined(_WIN64) + unsigned long r; + _BitScanForward64(&r, x); + return r; +#elif defined(_WIN32) + unsigned long r; + if (_BitScanForward(&r, (u32)x)) { + return (u32)r; + } + _BitScanForward(&r, x >> 32); + return (u32)(r + 32); +#else + return (u32)__builtin_ctzll(x); +#endif +} + +static really_inline +u32 lg2(u32 x) { + if (!x) { + return 0; + } + return 31 - clz32(x); +} + +static really_inline +u64a lg2_64(u64a x) { + if (!x) { + return 0; + } + return 63 - clz64(x); +} + +static really_inline +u32 findAndClearLSB_32(u32 *v) { + assert(*v != 0); // behaviour not defined in this case +#ifndef NO_ASM + u32 val = *v, offset; + __asm__ ("bsf %1, %0\n" + "btr %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + u32 val = *v; + u32 offset = ctz32(val); + *v = val & (val - 1); +#endif + + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearLSB_64(u64a *v) { + assert(*v != 0); // behaviour not defined in this case + +#ifdef ARCH_64_BIT +#if defined(ARCH_X86_64) && !defined(NO_ASM) + u64a val = *v, offset; + __asm__ ("bsfq %1, %0\n" + "btrq %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = ctz64(val); + *v = val & (val - 1); +#endif // ARCH_X86_64 +#else + // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't + // inline calls to __builtin_ctzll + u32 v1 = (u32)*v; + u32 v2 = (u32)(*v >> 32); + u32 offset; + if (v1) { + offset = findAndClearLSB_32(&v1); + *v = (u64a)v1 | ((u64a)v2 << 32); + } else { + offset = findAndClearLSB_32(&v2) + 32; + *v = (u64a)v2 << 32; + } +#endif + + assert(offset < 64); + return (u32)offset; +} + +static really_inline +u32 findAndClearMSB_32(u32 *v) { + assert(*v != 0); // behaviour not defined in this case +#ifndef NO_ASM + u32 val = *v, offset; + __asm__ ("bsr %1, %0\n" + "btr %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + u32 val = *v; + u32 offset = 31 - clz32(val); + *v = val & ~(1 << offset); +#endif + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearMSB_64(u64a *v) { + assert(*v != 0); // behaviour not defined in this case + +#ifdef ARCH_64_BIT +#if defined(ARCH_X86_64) && !defined(NO_ASM) + u64a val = *v, offset; + __asm__ ("bsrq %1, %0\n" + "btrq %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = 63 - clz64(val); + *v = val & ~(1ULL << offset); +#endif // ARCH_X86_64 +#else + // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't + // inline calls to __builtin_ctzll + u32 v1 = (u32)*v; + u32 v2 = (*v >> 32); + u32 offset; + if (v2) { + offset = findAndClearMSB_32(&v2) + 32; + *v = ((u64a)v2 << 32) | (u64a)v1; + } else { + offset = findAndClearMSB_32(&v1); + *v = (u64a)v1; + } +#endif + + assert(offset < 64); + return (u32)offset; +} + +static really_inline +u32 compress32(u32 x, u32 m) { +#if defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. 
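+    // PEXT gathers the bits of x selected by mask m and packs them
+    // contiguously into the low-order bits of the result.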
+ return _pext_u32(x, m); +#else + + // Return zero quickly on trivial cases + if ((x & m) == 0) { + return 0; + } + + u32 mk, mp, mv, t; + + x &= m; // clear irrelevant bits + + mk = ~m << 1; // we will count 0's to right + for (u32 i = 0; i < 5; i++) { + mp = mk ^ (mk << 1); + mp ^= mp << 2; + mp ^= mp << 4; + mp ^= mp << 8; + mp ^= mp << 16; + + mv = mp & m; // bits to move + m = (m ^ mv) | (mv >> (1 << i)); // compress m + t = x & mv; + x = (x ^ t) | (t >> (1 << i)); // compress x + mk = mk & ~mp; + } + + return x; +#endif +} + +static really_inline +u64a compress64(u64a x, u64a m) { +#if defined(ARCH_X86_64) && defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pext_u64(x, m); +#else + + // Return zero quickly on trivial cases + if ((x & m) == 0) { + return 0; + } + + u64a mk, mp, mv, t; + + x &= m; // clear irrelevant bits + + mk = ~m << 1; // we will count 0's to right + for (u32 i = 0; i < 6; i++) { + mp = mk ^ (mk << 1); + mp ^= mp << 2; + mp ^= mp << 4; + mp ^= mp << 8; + mp ^= mp << 16; + mp ^= mp << 32; + + mv = mp & m; // bits to move + m = (m ^ mv) | (mv >> (1 << i)); // compress m + t = x & mv; + x = (x ^ t) | (t >> (1 << i)); // compress x + mk = mk & ~mp; + } + + return x; +#endif +} + +static really_inline +u32 expand32(u32 x, u32 m) { +#if defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pdep_u32(x, m); +#else + + // Return zero quickly on trivial cases + if (!x || !m) { + return 0; + } + + u32 m0, mk, mp, mv, t; + u32 array[5]; + + m0 = m; // save original mask + mk = ~m << 1; // we will count 0's to right + + for (int i = 0; i < 5; i++) { + mp = mk ^ (mk << 1); // parallel suffix + mp = mp ^ (mp << 2); + mp = mp ^ (mp << 4); + mp = mp ^ (mp << 8); + mp = mp ^ (mp << 16); + mv = mp & m; // bits to move + array[i] = mv; + m = (m ^ mv) | (mv >> (1 << i)); // compress m + mk = mk & ~mp; + } + + for (int i = 4; i >= 0; i--) { + mv = array[i]; + t = x << (1 << i); + x = (x & ~mv) | (t & mv); + } + + return x & m0; // clear out extraneous bits +#endif +} + +static really_inline +u64a expand64(u64a x, u64a m) { +#if defined(ARCH_X86_64) && defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pdep_u64(x, m); +#else + + // Return zero quickly on trivial cases + if (!x || !m) { + return 0; + } + + u64a m0, mk, mp, mv, t; + u64a array[6]; + + m0 = m; // save original mask + mk = ~m << 1; // we will count 0's to right + + for (int i = 0; i < 6; i++) { + mp = mk ^ (mk << 1); // parallel suffix + mp = mp ^ (mp << 2); + mp = mp ^ (mp << 4); + mp = mp ^ (mp << 8); + mp = mp ^ (mp << 16); + mp = mp ^ (mp << 32); + mv = mp & m; // bits to move + array[i] = mv; + m = (m ^ mv) | (mv >> (1 << i)); // compress m + mk = mk & ~mp; + } + + for (int i = 5; i >= 0; i--) { + mv = array[i]; + t = x << (1 << i); + x = (x & ~mv) | (t & mv); + } + + return x & m0; // clear out extraneous bits +#endif +} + + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * by size of the datatype or larger. 
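+         * e.g. begin == 3 clears bits 0..3, so the lowest bit that can be
+         * returned is bit 4.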
*/ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64(bitfield); +} + +static really_inline +char bf64_set(u64a *bitfield, u32 i) { + assert(i < 64); + u64a mask = 1ULL << i; + char was_set = !!(*bitfield & mask); + *bitfield |= mask; + + return was_set; +} + +static really_inline +void bf64_unset(u64a *bitfield, u32 i) { + assert(i < 64); + *bitfield &= ~(1ULL << i); +} + +static really_inline +u32 rank_in_mask32(u32 mask, u32 bit) { + assert(bit < sizeof(u32) * 8); + assert(mask & (u32)(1U << bit)); + mask &= (u32)(1U << bit) - 1; + return popcount32(mask); +} + +static really_inline +u32 rank_in_mask64(u64a mask, u32 bit) { + assert(bit < sizeof(u64a) * 8); + assert(mask & (u64a)(1ULL << bit)); + mask &= (u64a)(1ULL << bit) - 1; + return popcount64(mask); +} + +static really_inline +u32 pext32(u32 x, u32 mask) { +#if defined(HAVE_BMI2) + // Intel BMI2 can do this operation in one instruction. + return _pext_u32(x, mask); +#else + + u32 result = 0, num = 1; + while (mask != 0) { + u32 bit = findAndClearLSB_32(&mask); + if (x & (1U << bit)) { + assert(num != 0); // more than 32 bits! + result |= num; + } + num <<= 1; + } + return result; +#endif +} + +static really_inline +u64a pext64(u64a x, u64a mask) { +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + // Intel BMI2 can do this operation in one instruction. + return _pext_u64(x, mask); +#else + + u32 result = 0, num = 1; + while (mask != 0) { + u32 bit = findAndClearLSB_64(&mask); + if (x & (1ULL << bit)) { + assert(num != 0); // more than 32 bits! + result |= num; + } + num <<= 1; + } + return result; +#endif +} + +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) +static really_inline +u64a pdep64(u64a x, u64a mask) { + return _pdep_u64(x, mask); +} +#endif + +#endif // BITUTILS_H diff --git a/regex/util/compare.h b/regex/util/compare.h new file mode 100644 index 000000000..eaa717a4c --- /dev/null +++ b/regex/util/compare.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef COMPARE_H +#define COMPARE_H + +#include "unaligned.h" +#include "ue2common.h" + +/* Our own definitions of tolower, toupper and isalpha are provided to prevent + * us from going out to libc for these tests. */ + +static really_inline +char myisupper(const char c) { + return ((c >= 'A') && (c <= 'Z')); +} + +static really_inline +char myislower(const char c) { + return ((c >= 'a') && (c <= 'z')); +} + +static really_inline +char mytolower(const char c) { + if (myisupper(c)) { + return c + 0x20; + } + return c; +} + +static really_inline +char mytoupper(const char c) { + if (myislower(c)) { + return c - 0x20; + } + return c; +} + +/* this is a slightly warped definition of `alpha'. What we really + * mean is: does this character have different uppercase and lowercase forms? + */ +static really_inline char ourisalpha(const char c) { + return mytolower(c) != mytoupper(c); +} + +static really_inline char ourisprint(const char c) { + return c >= 0x20 && c <= 0x7e; +} + +// Paul Hsieh's SWAR toupper; used because it doesn't +// matter whether we go toupper or tolower. We should +// probably change the other one +static really_inline +u32 theirtoupper32(const u32 x) { + u32 b = 0x80808080ul | x; + u32 c = b - 0x61616161ul; + u32 d = ~(b - 0x7b7b7b7bul); + u32 e = (c & d) & (~x & 0x80808080ul); + return x - (e >> 2); +} + +// 64-bit variant. +static really_inline +u64a theirtoupper64(const u64a x) { + u64a b = 0x8080808080808080ull | x; + u64a c = b - 0x6161616161616161ull; + u64a d = ~(b - 0x7b7b7b7b7b7b7b7bull); + u64a e = (c & d) & (~x & 0x8080808080808080ull); + u64a v = x - (e >> 2); + return v; +} + +static really_inline +int cmpNocaseNaive(const u8 *p1, const u8 *p2, size_t len) { + const u8 *pEnd = p1 + len; + for (; p1 < pEnd; p1++, p2++) { + assert(!ourisalpha(*p2) || myisupper(*p2)); // Already upper-case. + if ((u8)mytoupper(*p1) != *p2) { + return 1; + } + } + return 0; +} + +static really_inline +int cmpCaseNaive(const u8 *p1, const u8 *p2, size_t len) { + const u8 *pEnd = p1 + len; + for (; p1 < pEnd; p1++, p2++) { + if (*p1 != *p2) { + return 1; + } + } + return 0; +} + +#ifdef ARCH_64_BIT +# define CMP_T u64a +# define ULOAD(x) unaligned_load_u64a(x) +# define TOUPPER(x) theirtoupper64(x) +#else +# define CMP_T u32 +# define ULOAD(x) unaligned_load_u32(x) +# define TOUPPER(x) theirtoupper32(x) +#endif + +#define CMP_SIZE sizeof(CMP_T) + +/** + * \brief Compare two strings, optionally caselessly. + * + * Note: If nocase is true, p2 is assumed to be already upper-case. + */ +#if defined(ARCH_IA32) +static UNUSED never_inline +#else +static really_inline +#endif +int cmpForward(const u8 *p1, const u8 *p2, size_t len, char nocase) { + if (len < CMP_SIZE) { + return nocase ? cmpNocaseNaive(p1, p2, len) + : cmpCaseNaive(p1, p2, len); + } + + const u8 *p1_end = p1 + len - CMP_SIZE; + const u8 *p2_end = p2 + len - CMP_SIZE; + + if (nocase) { // Case-insensitive version. 
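+        // p1 is upper-cased word-at-a-time with the SWAR helper above;
+        // p2 is required to be pre-upper-cased by the caller. The final
+        // (possibly overlapping) word at p1_end covers the tail.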
+ for (; p1 < p1_end; p1 += CMP_SIZE, p2 += CMP_SIZE) { + assert(ULOAD(p2) == TOUPPER(ULOAD(p2))); // Already upper-case. + if (TOUPPER(ULOAD(p1)) != ULOAD(p2)) { + return 1; + } + } + assert(ULOAD(p2_end) == TOUPPER(ULOAD(p2_end))); // Already upper-case. + if (TOUPPER(ULOAD(p1_end)) != ULOAD(p2_end)) { + return 1; + } + } else { // Case-sensitive version. + for (; p1 < p1_end; p1 += CMP_SIZE, p2 += CMP_SIZE) { + if (ULOAD(p1) != ULOAD(p2)) { + return 1; + } + } + if (ULOAD(p1_end) != ULOAD(p2_end)) { + return 1; + } + } + + return 0; +} + +#undef CMP_T +#undef ULOAD +#undef TOUPPER +#undef CMP_SIZE + +#endif + diff --git a/regex/util/copybytes.h b/regex/util/copybytes.h new file mode 100644 index 000000000..7f37d96bc --- /dev/null +++ b/regex/util/copybytes.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef COPY_BYTES_H +#define COPY_BYTES_H + +#include "unaligned.h" +#include "simd_utils.h" + +static really_inline +void copy_upto_64_bytes(u8 *dst, const u8 *src, unsigned int len) { + switch (len) { + case 0: + break; + case 1: + *dst = *src; + break; + case 2: + unaligned_store_u16(dst, unaligned_load_u16(src)); + break; + case 3: + unaligned_store_u16(dst, unaligned_load_u16(src)); + dst[2] = src[2]; + break; + case 4: + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 5: + case 6: + case 7: + unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 8: + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 16: + storeu128(dst, loadu128(src)); + break; + case 17: + case 18: + case 19: + case 20: + case 21: + case 22: + case 23: + case 24: + case 25: + case 26: + case 27: + case 28: + case 29: + case 30: + case 31: + storeu128(dst + len - 16, loadu128(src + len - 16)); + storeu128(dst, loadu128(src)); + break; + case 32: + storeu256(dst, loadu256(src)); + break; +#ifdef HAVE_AVX512 + case 64: + storebytes512(dst, loadu512(src), 64); + break; + default: + assert(len < 64); + u64a k = (1ULL << len) - 1; + storeu_mask_m512(dst, k, loadu_maskz_m512(k, src)); + break; +#else + default: + assert(0); + break; +#endif + } +} + +#endif diff --git a/regex/util/cpuid_flags.c b/regex/util/cpuid_flags.c new file mode 100644 index 000000000..c00ce58e2 --- /dev/null +++ b/regex/util/cpuid_flags.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "cpuid_flags.h" +#include "cpuid_inline.h" +#include "ue2common.h" +#include "hs_compile.h" // for HS_MODE_ flags +#include "hs_internal.h" +#include "util/arch.h" + +#if !defined(_WIN32) && !defined(CPUID_H_) +#include +#endif + +u64a cpuid_flags(void) { + u64a cap = 0; + + if (check_avx2()) { + DEBUG_PRINTF("AVX2 enabled\n"); + cap |= HS_CPU_FEATURES_AVX2; + } + + if (check_avx512()) { + DEBUG_PRINTF("AVX512 enabled\n"); + cap |= HS_CPU_FEATURES_AVX512; + } + + if (check_avx512vbmi()) { + DEBUG_PRINTF("AVX512VBMI enabled\n"); + cap |= HS_CPU_FEATURES_AVX512VBMI; + } + +#if !defined(FAT_RUNTIME) && !defined(HAVE_AVX2) + cap &= ~HS_CPU_FEATURES_AVX2; +#endif + +#if (!defined(FAT_RUNTIME) && !defined(HAVE_AVX512)) || \ + (defined(FAT_RUNTIME) && !defined(BUILD_AVX512)) + cap &= ~HS_CPU_FEATURES_AVX512; +#endif + +#if (!defined(FAT_RUNTIME) && !defined(HAVE_AVX512VBMI)) || \ + (defined(FAT_RUNTIME) && !defined(BUILD_AVX512VBMI)) + cap &= ~HS_CPU_FEATURES_AVX512VBMI; +#endif + + return cap; +} + +struct family_id { + u32 full_family; + u32 full_model; + u32 tune; +}; + +/* from table 35-1 of the Intel 64 and IA32 Arch. Software Developer's Manual + * and "Intel Architecture and Processor Identification With CPUID Model and + * Family Numbers" */ +static const struct family_id known_microarch[] = { + { 0x6, 0x37, HS_TUNE_FAMILY_SLM }, /* baytrail */ + { 0x6, 0x4A, HS_TUNE_FAMILY_SLM }, /* silvermont */ + { 0x6, 0x4C, HS_TUNE_FAMILY_SLM }, /* silvermont */ + { 0x6, 0x4D, HS_TUNE_FAMILY_SLM }, /* avoton, rangley */ + { 0x6, 0x5A, HS_TUNE_FAMILY_SLM }, /* silvermont */ + { 0x6, 0x5D, HS_TUNE_FAMILY_SLM }, /* silvermont */ + + { 0x6, 0x5C, HS_TUNE_FAMILY_GLM }, /* goldmont */ + { 0x6, 0x5F, HS_TUNE_FAMILY_GLM }, /* denverton */ + + { 0x6, 0x3C, HS_TUNE_FAMILY_HSW }, /* haswell */ + { 0x6, 0x45, HS_TUNE_FAMILY_HSW }, /* haswell */ + { 0x6, 0x46, HS_TUNE_FAMILY_HSW }, /* haswell */ + { 0x6, 0x3F, HS_TUNE_FAMILY_HSW }, /* haswell Xeon */ + + { 0x6, 0x3E, HS_TUNE_FAMILY_IVB }, /* ivybridge Xeon */ + { 0x6, 0x3A, HS_TUNE_FAMILY_IVB }, /* ivybridge */ + + { 0x6, 0x2A, HS_TUNE_FAMILY_SNB }, /* sandybridge */ + { 0x6, 0x2D, HS_TUNE_FAMILY_SNB }, /* sandybridge Xeon */ + + { 0x6, 0x3D, HS_TUNE_FAMILY_BDW }, /* broadwell Core-M */ + { 0x6, 0x47, HS_TUNE_FAMILY_BDW }, /* broadwell */ + { 0x6, 0x4F, HS_TUNE_FAMILY_BDW }, /* broadwell xeon */ + { 0x6, 0x56, HS_TUNE_FAMILY_BDW }, /* broadwell xeon-d */ + + { 0x6, 0x4E, HS_TUNE_FAMILY_SKL }, /* Skylake Mobile */ + { 0x6, 0x5E, HS_TUNE_FAMILY_SKL }, /* Skylake Core/E3 Xeon */ + { 0x6, 0x55, HS_TUNE_FAMILY_SKX }, /* Skylake Xeon */ + + { 0x6, 0x8E, HS_TUNE_FAMILY_SKL }, /* Kabylake Mobile */ + { 0x6, 0x9E, HS_TUNE_FAMILY_SKL }, /* Kabylake desktop */ + + { 0x6, 0x7D, HS_TUNE_FAMILY_ICL }, /* Icelake */ + { 0x6, 0x7E, HS_TUNE_FAMILY_ICL }, /* Icelake */ + { 0x6, 0x6A, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon-D */ + { 0x6, 0x6C, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon */ + +}; + +#ifdef DUMP_SUPPORT +static UNUSED +const char *dumpTune(u32 tune) { +#define T_CASE(x) case x: return #x; + switch (tune) { + T_CASE(HS_TUNE_FAMILY_SLM); + T_CASE(HS_TUNE_FAMILY_GLM); + T_CASE(HS_TUNE_FAMILY_HSW); + T_CASE(HS_TUNE_FAMILY_SNB); + T_CASE(HS_TUNE_FAMILY_IVB); + T_CASE(HS_TUNE_FAMILY_BDW); + T_CASE(HS_TUNE_FAMILY_SKL); + T_CASE(HS_TUNE_FAMILY_SKX); + T_CASE(HS_TUNE_FAMILY_ICL); + T_CASE(HS_TUNE_FAMILY_ICX); + } +#undef T_CASE + return "unknown"; +} +#endif + +u32 cpuid_tune(void) { + unsigned int eax, ebx, ecx, edx; + + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + + u32 
family = (eax >> 8) & 0xf; + u32 model = 0; + + if (family == 0x6 || family == 0xf) { + model = ((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0); + } else { + model = (eax >> 4) & 0xf; + } + + DEBUG_PRINTF("family = %xh model = %xh\n", family, model); + for (u32 i = 0; i < ARRAY_LENGTH(known_microarch); i++) { + if (family != known_microarch[i].full_family) { + continue; + } + + if (model != known_microarch[i].full_model) { + continue; + } + + u32 tune = known_microarch[i].tune; + DEBUG_PRINTF("found tune flag %s\n", dumpTune(tune) ); + return tune; + } + + return HS_TUNE_FAMILY_GENERIC; +} diff --git a/regex/util/cpuid_flags.h b/regex/util/cpuid_flags.h new file mode 100644 index 000000000..527c6d52f --- /dev/null +++ b/regex/util/cpuid_flags.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef UTIL_CPUID_H_ +#define UTIL_CPUID_H_ + +#include "ue2common.h" + +#if !defined(_WIN32) && !defined(CPUID_H_) +#include + /* system header doesn't have a header guard */ +#define CPUID_H_ +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + +/* returns HS_CPU_FEATURES_* mask. */ +u64a cpuid_flags(void); + +u32 cpuid_tune(void); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* UTIL_CPUID_H_ */ + diff --git a/regex/util/cpuid_inline.h b/regex/util/cpuid_inline.h new file mode 100644 index 000000000..b7b424528 --- /dev/null +++ b/regex/util/cpuid_inline.h @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CPUID_INLINE_H_ +#define CPUID_INLINE_H_ + +#include "ue2common.h" +#include "cpuid_flags.h" + +#if !defined(_WIN32) && !defined(CPUID_H_) +#include +/* system header doesn't have a header guard */ +#define CPUID_H_ +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + +static inline +void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, + unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { +#ifndef _WIN32 + __cpuid_count(op, leaf, *eax, *ebx, *ecx, *edx); +#else + int a[4]; + __cpuidex(a, op, leaf); + *eax = a[0]; + *ebx = a[1]; + *ecx = a[2]; + *edx = a[3]; +#endif +} + +// ECX +#define CPUID_SSE3 (1 << 0) +#define CPUID_SSSE3 (1 << 9) +#define CPUID_SSE4_1 (1 << 19) +#define CPUID_SSE4_2 (1 << 20) +#define CPUID_POPCNT (1 << 23) +#define CPUID_XSAVE (1 << 27) +#define CPUID_AVX (1 << 28) + +// EDX +#define CPUID_FXSAVE (1 << 24) +#define CPUID_SSE (1 << 25) +#define CPUID_SSE2 (1 << 26) +#define CPUID_HTT (1 << 28) + +// Structured Extended Feature Flags Enumeration Leaf ECX values +#define CPUID_AVX512VBMI (1 << 1) + +// Structured Extended Feature Flags Enumeration Leaf EBX values +#define CPUID_BMI (1 << 3) +#define CPUID_AVX2 (1 << 5) +#define CPUID_BMI2 (1 << 8) +#define CPUID_AVX512F (1 << 16) +#define CPUID_AVX512BW (1 << 30) + +// Extended Control Register 0 (XCR0) values +#define CPUID_XCR0_SSE (1 << 1) +#define CPUID_XCR0_AVX (1 << 2) +#define CPUID_XCR0_OPMASK (1 << 5) // k-regs +#define CPUID_XCR0_ZMM_Hi256 (1 << 6) // upper 256 bits of ZMM0-ZMM15 +#define CPUID_XCR0_Hi16_ZMM (1 << 7) // ZMM16-ZMM31 + +#define CPUID_XCR0_AVX512 \ + (CPUID_XCR0_OPMASK | CPUID_XCR0_ZMM_Hi256 | CPUID_XCR0_Hi16_ZMM) + +static inline +u64a xgetbv(u32 op) { +#if defined(_WIN32) || defined(__INTEL_COMPILER) + return _xgetbv(op); +#else + u32 a, d; + __asm__ volatile ( + "xgetbv\n" + : "=a"(a), + "=d"(d) + : "c"(op)); + return ((u64a)d << 32) + a; +#endif +} + +static inline +int check_avx2(void) { +#if defined(__INTEL_COMPILER) + return _may_i_use_cpu_feature(_FEATURE_AVX2); +#else + unsigned int eax, ebx, ecx, edx; + + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + + /* check AVX is supported and XGETBV is enabled by OS */ + if ((ecx & (CPUID_AVX | CPUID_XSAVE)) != (CPUID_AVX | CPUID_XSAVE)) { + DEBUG_PRINTF("AVX and XSAVE not supported\n"); + return 0; + } + + /* check that SSE and AVX registers are enabled by OS */ + u64a xcr0 = xgetbv(0); + if ((xcr0 & (CPUID_XCR0_SSE | CPUID_XCR0_AVX)) != + (CPUID_XCR0_SSE | CPUID_XCR0_AVX)) { + DEBUG_PRINTF("SSE and AVX registers not enabled\n"); + 
return 0; + } + + /* ECX and EDX contain capability flags */ + ecx = 0; + cpuid(7, 0, &eax, &ebx, &ecx, &edx); + + if (ebx & CPUID_AVX2) { + DEBUG_PRINTF("AVX2 enabled\n"); + return 1; + } + + return 0; +#endif +} + +static inline +int check_avx512(void) { + /* + * For our purposes, having avx512 really means "can we use AVX512BW?" + */ +#if defined(__INTEL_COMPILER) + return _may_i_use_cpu_feature(_FEATURE_AVX512BW | _FEATURE_AVX512VL); +#else + unsigned int eax, ebx, ecx, edx; + + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + + /* check XSAVE is enabled by OS */ + if (!(ecx & CPUID_XSAVE)) { + DEBUG_PRINTF("AVX and XSAVE not supported\n"); + return 0; + } + + /* check that AVX 512 registers are enabled by OS */ + u64a xcr0 = xgetbv(0); + if ((xcr0 & CPUID_XCR0_AVX512) != CPUID_XCR0_AVX512) { + DEBUG_PRINTF("AVX512 registers not enabled\n"); + return 0; + } + + /* ECX and EDX contain capability flags */ + ecx = 0; + cpuid(7, 0, &eax, &ebx, &ecx, &edx); + + if (!(ebx & CPUID_AVX512F)) { + DEBUG_PRINTF("AVX512F (AVX512 Foundation) instructions not enabled\n"); + return 0; + } + + if (ebx & CPUID_AVX512BW) { + DEBUG_PRINTF("AVX512BW instructions enabled\n"); + return 1; + } + + return 0; +#endif +} + +static inline +int check_avx512vbmi(void) { +#if defined(__INTEL_COMPILER) + return _may_i_use_cpu_feature(_FEATURE_AVX512VBMI); +#else + unsigned int eax, ebx, ecx, edx; + + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + + /* check XSAVE is enabled by OS */ + if (!(ecx & CPUID_XSAVE)) { + DEBUG_PRINTF("AVX and XSAVE not supported\n"); + return 0; + } + + /* check that AVX 512 registers are enabled by OS */ + u64a xcr0 = xgetbv(0); + if ((xcr0 & CPUID_XCR0_AVX512) != CPUID_XCR0_AVX512) { + DEBUG_PRINTF("AVX512 registers not enabled\n"); + return 0; + } + + /* ECX and EDX contain capability flags */ + ecx = 0; + cpuid(7, 0, &eax, &ebx, &ecx, &edx); + + if (!(ebx & CPUID_AVX512F)) { + DEBUG_PRINTF("AVX512F (AVX512 Foundation) instructions not enabled\n"); + return 0; + } + + if (!(ebx & CPUID_AVX512BW)) { + DEBUG_PRINTF("AVX512BW instructions not enabled\n"); + return 0; + } + + if (ecx & CPUID_AVX512VBMI) { + DEBUG_PRINTF("AVX512VBMI instructions enabled\n"); + return 1; + } + + return 0; +#endif +} + +static inline +int check_ssse3(void) { + unsigned int eax, ebx, ecx, edx; + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + return !!(ecx & CPUID_SSSE3); +} + +static inline +int check_sse42(void) { + unsigned int eax, ebx, ecx, edx; + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + return !!(ecx & CPUID_SSE4_2); +} + +static inline +int check_popcnt(void) { + unsigned int eax, ebx, ecx, edx; + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + return !!(ecx & CPUID_POPCNT); +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* CPUID_INLINE_H_ */ diff --git a/regex/util/exhaust.h b/regex/util/exhaust.h new file mode 100644 index 000000000..d6f2ac06d --- /dev/null +++ b/regex/util/exhaust.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Inline functions for manipulating exhaustion vector. + */ + +#ifndef EXHAUST_H +#define EXHAUST_H + +#include "ue2common.h" + +/** Index meaning a given exhaustion key is invalid. */ +#define INVALID_EKEY (~(u32)0) + +#endif diff --git a/regex/util/fatbit.h b/regex/util/fatbit.h new file mode 100644 index 000000000..3c65db1a5 --- /dev/null +++ b/regex/util/fatbit.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FATBIT_H +#define FATBIT_H + +/** \file + * \brief Multibit: fast bitset structure for use in scratch. + * Uses more space than mmbit, to avoid partial words for hopefully a taddy more + * performance. + * + * API is also trimmed down. 
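+ *
+ * Typical use (all names defined below): zero the structure with
+ * fatbit_clear(), then drive it with fatbit_set() / fatbit_unset() /
+ * fatbit_isset() / fatbit_iterate(), all of which simply forward to the
+ * corresponding mmbit_* routines on the raw byte view of the union.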
+ */ + +#include "multibit.h" +#include "ue2common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MIN_FAT_SIZE 32 + +struct fatbit { + union { + u64a flat[MIN_FAT_SIZE / sizeof(u64a)]; + u8 raw[MIN_FAT_SIZE]; + } fb_int; + u64a tail[]; +}; + +static really_inline +void fatbit_clear(struct fatbit *bits) { + assert(ISALIGNED(bits)); + memset(bits, 0, sizeof(struct fatbit)); +} + +static really_inline +char fatbit_set(struct fatbit *bits, u32 total_bits, u32 key) { + assert(ISALIGNED(bits)); + return mmbit_set(bits->fb_int.raw, total_bits, key); +} + +static really_inline +void fatbit_unset(struct fatbit *bits, u32 total_bits, u32 key) { + assert(ISALIGNED(bits)); + mmbit_unset(bits->fb_int.raw, total_bits, key); +} + +static really_inline +char fatbit_isset(const struct fatbit *bits, u32 total_bits, u32 key) { + assert(ISALIGNED(bits)); + return mmbit_isset(bits->fb_int.raw, total_bits, key); +} + +static really_inline +u32 fatbit_iterate(const struct fatbit *bits, u32 total_bits, u32 it_in) { + assert(ISALIGNED(bits)); + /* TODO: iterate_flat could be specialised as we don't have to worry about + * partial blocks. */ + return mmbit_iterate(bits->fb_int.raw, total_bits, it_in); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/regex/util/intrinsics.h b/regex/util/intrinsics.h new file mode 100644 index 000000000..0156f9ed3 --- /dev/null +++ b/regex/util/intrinsics.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Wrapper around the compiler supplied intrinsic header + */ + +#ifndef INTRINSICS_H +#define INTRINSICS_H + +#include "config.h" + +#ifdef __cplusplus +# if defined(HAVE_CXX_X86INTRIN_H) +# define USE_X86INTRIN_H +# endif +#else // C +# if defined(HAVE_C_X86INTRIN_H) +# define USE_X86INTRIN_H +# endif +#endif + +#ifdef __cplusplus +# if defined(HAVE_CXX_INTRIN_H) +# define USE_INTRIN_H +# endif +#else // C +# if defined(HAVE_C_INTRIN_H) +# define USE_INTRIN_H +# endif +#endif + +#if defined(USE_X86INTRIN_H) +#ifdef __KERNEL__ +#define _MM_MALLOC_H_INCLUDED +#endif +#include +#elif defined(USE_INTRIN_H) +#include +#else +#error no intrinsics file +#endif + +#endif // INTRINSICS_H diff --git a/regex/util/join.h b/regex/util/join.h new file mode 100644 index 000000000..7d5a30c39 --- /dev/null +++ b/regex/util/join.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef JOIN_H + +#define JOIN(x, y) JOIN_AGAIN(x, y) +#define JOIN_AGAIN(x, y) x ## y + +#define JOIN3(x, y, z) JOIN_AGAIN3(x, y, z) +#define JOIN_AGAIN3(x, y, z) x ## y ## z + +#define JOIN4(w, x, y, z) JOIN_AGAIN4(w, x, y, z) +#define JOIN_AGAIN4(w, x, y, z) w ## x ## y ## z + +#endif diff --git a/regex/util/logical.h b/regex/util/logical.h new file mode 100644 index 000000000..0c8b6469a --- /dev/null +++ b/regex/util/logical.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Inline functions for manipulating logical combinations. + */ + +#ifndef LOGICAL_H +#define LOGICAL_H + +#include "ue2common.h" + +/** Index meaning a given logical key is invalid. */ +#define INVALID_LKEY (~(u32)0) +#define INVALID_CKEY INVALID_LKEY + +/** Logical operation type, the priority is from high to low. */ +enum LogicalOpType { + LOGICAL_OP_NOT, + LOGICAL_OP_AND, + LOGICAL_OP_OR, + LAST_LOGICAL_OP = LOGICAL_OP_OR //!< Sentinel. +}; + +#define UNKNOWN_OP (~(u32)0) + +/** Logical Operation is consist of 4 parts. */ +struct LogicalOp { + u32 id; //!< logical operator/operation id + u32 op; //!< LogicalOpType + u32 lo; //!< left operand + u32 ro; //!< right operand +}; + +/** Each logical combination has its info: + * It occupies a region in LogicalOp vector. + * It has an exhaustion key for single-match mode. */ +struct CombInfo { + u32 id; + u32 ekey; //!< exhaustion key + u32 start; //!< ckey of logical operation to start calculating + u32 result; //!< ckey of logical operation to give final result + u64a min_offset; + u64a max_offset; +}; + +/** Temporarily use to seperate operations' id from reports' lkey + * when building logicalTree in shunting yard algorithm, + * operations' id will be finally renumbered following reports' lkey. */ +#define LOGICAL_OP_BIT 0x80000000UL + +#endif diff --git a/regex/util/masked_move.c b/regex/util/masked_move.c new file mode 100644 index 000000000..001cd49f2 --- /dev/null +++ b/regex/util/masked_move.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#include "ue2common.h" +#include "masked_move.h" +#include "util/arch.h" + +#if defined(HAVE_AVX2) +/* masks for masked moves */ + +/* magic mask for maskload (vmmaskmovq) - described in UE-2424 */ +const ALIGN_CL_DIRECTIVE u32 mm_mask_mask[16] = { + 0x00000000U, + 0x00000000U, + 0x00000000U, + 0x00000000U, + 0x00000000U, + 0x00000000U, + 0x00000000U, + 0x00000000U, + 0xff000000U, + 0xfe000000U, + 0xfc000000U, + 0xf8000000U, + 0xf0000000U, + 0xe0000000U, + 0xc0000000U, + 0x80000000U, +}; + +const u32 mm_shuffle_end[32][8] = { + { 0x03020100U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x02010080U, 0x80808003U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x01008080U, 0x80800302U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x00808080U, 0x80030201U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x03020100U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x02010080U, 0x80808003U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x01008080U, 0x80800302U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x00808080U, 0x80030201U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x03020100U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x02010080U, 0x80808003U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x01008080U, 0x80800302U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x00808080U, 0x80030201U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x03020100U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x02010080U, 0x80808003U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x01008080U, 0x80800302U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x00808080U, 0x80030201U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x03020100U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x02010080U, 0x80808003U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x01008080U, 0x80800302U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x00808080U, 0x80030201U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x03020100U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x02010080U, 0x80808003U, 0x80808080U, }, + { 0x80808080U, 
0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x01008080U, 0x80800302U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x00808080U, 0x80030201U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x03020100U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x02010080U, 0x80808003U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x01008080U, 0x80800302U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x00808080U, 0x80030201U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x03020100U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x02010080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x01008080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x00808080U, }, +}; +#endif // AVX2 diff --git a/regex/util/masked_move.h b/regex/util/masked_move.h new file mode 100644 index 000000000..4c877ca9e --- /dev/null +++ b/regex/util/masked_move.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef MASKED_MOVE_H +#define MASKED_MOVE_H + +#include "arch.h" + +#if defined(HAVE_AVX2) + +#include "unaligned.h" +#include "simd_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif +extern const u32 mm_mask_mask[16]; +extern const u32 mm_shuffle_end[32][8]; +#ifdef __cplusplus +} +#endif + +/* load mask for len bytes from start of buffer */ +static really_inline m256 +_get_mm_mask_end(u32 len) { + assert(len <= 32); + const u8 *masky = (const u8 *)mm_mask_mask; + m256 mask = load256(masky + 32); + mask = _mm256_sll_epi32(mask, _mm_cvtsi32_si128(8 - (len >> 2))); + return mask; +} + +/* + * masked_move256_len: Will load len bytes from *buf into m256 + * _______________________________ + * |0<----len---->| 32| + * ------------------------------- + */ +static really_inline m256 +masked_move256_len(const u8 *buf, const u32 len) { + assert(len >= 4); + + m256 lmask = _get_mm_mask_end(len); + + u32 end = unaligned_load_u32(buf + len - 4); + m256 preshufend = _mm256_broadcastq_epi64(_mm_cvtsi32_si128(end)); + m256 v = _mm256_maskload_epi32((const int *)buf, lmask); + m256 shufend = pshufb_m256(preshufend, + loadu256(&mm_shuffle_end[len - 4])); + m256 target = or256(v, shufend); + + return target; +} + +#endif /* AVX2 */ +#endif /* MASKED_MOVE_H */ + diff --git a/regex/util/multibit.c b/regex/util/multibit.c new file mode 100644 index 000000000..de192d7dd --- /dev/null +++ b/regex/util/multibit.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Multibit: lookup tables and support code. + * + * This C file contains the constant tables used by multibit, so we don't end + * up creating copies of them for every unit that uses it. 
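+ *
+ * mmbit_keyshift_lut and mmbit_maxlevel_direct_lut below are indexed by
+ * clz32(total_bits - 1): for the 64-ary (MMB_KEY_SHIFT == 6) tree they give
+ * the key shift of the root level and the maximum level depth without
+ * needing a division at runtime.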
+ */ + +#include "multibit.h" +#include "ue2common.h" + +const u8 mmbit_keyshift_lut[32] = { + 30, 30, 24, 24, 24, 24, 24, 24, 18, 18, 18, + 18, 18, 18, 12, 12, 12, 12, 12, 12, 6, 6, + 6, 6, 6, 6, 0, 0, 0, 0, 0, 0 +}; + +// The only actually valid values of ks are as shown in the LUT above, but a +// division is just too expensive. +const u8 mmbit_maxlevel_from_keyshift_lut[32] = { + 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, + 5, 5 +}; + +const u8 mmbit_maxlevel_direct_lut[32] = { + 5, 5, 4, 4, 4, 4, 4, 4, 3, 3, 3, + 3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 0 +}; + +#define ZERO_TO_LUT(x) ((1ULL << x) - 1) + +const u64a mmbit_zero_to_lut[65] = { + ZERO_TO_LUT(0), + ZERO_TO_LUT(1), + ZERO_TO_LUT(2), + ZERO_TO_LUT(3), + ZERO_TO_LUT(4), + ZERO_TO_LUT(5), + ZERO_TO_LUT(6), + ZERO_TO_LUT(7), + ZERO_TO_LUT(8), + ZERO_TO_LUT(9), + ZERO_TO_LUT(10), + ZERO_TO_LUT(11), + ZERO_TO_LUT(12), + ZERO_TO_LUT(13), + ZERO_TO_LUT(14), + ZERO_TO_LUT(15), + ZERO_TO_LUT(16), + ZERO_TO_LUT(17), + ZERO_TO_LUT(18), + ZERO_TO_LUT(19), + ZERO_TO_LUT(20), + ZERO_TO_LUT(21), + ZERO_TO_LUT(22), + ZERO_TO_LUT(23), + ZERO_TO_LUT(24), + ZERO_TO_LUT(25), + ZERO_TO_LUT(26), + ZERO_TO_LUT(27), + ZERO_TO_LUT(28), + ZERO_TO_LUT(29), + ZERO_TO_LUT(30), + ZERO_TO_LUT(31), + ZERO_TO_LUT(32), + ZERO_TO_LUT(33), + ZERO_TO_LUT(34), + ZERO_TO_LUT(35), + ZERO_TO_LUT(36), + ZERO_TO_LUT(37), + ZERO_TO_LUT(38), + ZERO_TO_LUT(39), + ZERO_TO_LUT(40), + ZERO_TO_LUT(41), + ZERO_TO_LUT(42), + ZERO_TO_LUT(43), + ZERO_TO_LUT(44), + ZERO_TO_LUT(45), + ZERO_TO_LUT(46), + ZERO_TO_LUT(47), + ZERO_TO_LUT(48), + ZERO_TO_LUT(49), + ZERO_TO_LUT(50), + ZERO_TO_LUT(51), + ZERO_TO_LUT(52), + ZERO_TO_LUT(53), + ZERO_TO_LUT(54), + ZERO_TO_LUT(55), + ZERO_TO_LUT(56), + ZERO_TO_LUT(57), + ZERO_TO_LUT(58), + ZERO_TO_LUT(59), + ZERO_TO_LUT(60), + ZERO_TO_LUT(61), + ZERO_TO_LUT(62), + ZERO_TO_LUT(63), + ~0ULL +}; + +const u32 mmbit_root_offset_from_level[7] = { + 0, + 1, + 1 + (1 << MMB_KEY_SHIFT), + 1 + (1 << MMB_KEY_SHIFT) + (1 << MMB_KEY_SHIFT * 2), + 1 + (1 << MMB_KEY_SHIFT) + (1 << MMB_KEY_SHIFT * 2) + (1 << MMB_KEY_SHIFT * 3), + 1 + (1 << MMB_KEY_SHIFT) + (1 << MMB_KEY_SHIFT * 2) + (1 << MMB_KEY_SHIFT * 3) + (1 << MMB_KEY_SHIFT * 4), + 1 + (1 << MMB_KEY_SHIFT) + (1 << MMB_KEY_SHIFT * 2) + (1 << MMB_KEY_SHIFT * 3) + (1 << MMB_KEY_SHIFT * 4) + (1 << MMB_KEY_SHIFT * 5), +}; diff --git a/regex/util/multibit.h b/regex/util/multibit.h new file mode 100644 index 000000000..8697fb90a --- /dev/null +++ b/regex/util/multibit.h @@ -0,0 +1,1506 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Multibit: fast bitset structure, main runtime. + * + * *Structure* + * + * For sizes <= MMB_FLAT_MAX_BITS, a flat bit vector is used, stored as N + * 64-bit blocks followed by one "runt block". + * + * In larger cases, we use a sequence of blocks forming a tree. Each bit in an + * internal block indicates whether its child block contains valid data. Every + * level bar the last is complete. The last level is just a basic bit vector. + * + * ----------------------------------------------------------------------------- + * WARNING: + * + * mmbit code assumes that it is legal to load 8 bytes before the end of the + * mmbit. This means that for small mmbits (< 8byte), data may be read from + * before the base pointer. It is the user's responsibility to ensure that this + * is possible. + * ----------------------------------------------------------------------------- + */ +#ifndef MULTIBIT_H +#define MULTIBIT_H + +#include "config.h" +#include "ue2common.h" +#include "bitutils.h" +#include "partial_store.h" +#include "unaligned.h" +#include "multibit_internal.h" + +#ifndef __KERNEL__ +#include +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define MMB_ONE (1ULL) +#define MMB_ALL_ONES (0xffffffffffffffffULL) + +/** \brief Number of bits in a block. */ +#define MMB_KEY_BITS (sizeof(MMB_TYPE) * 8) + +#define MMB_KEY_MASK (MMB_KEY_BITS - 1) + +// Key structure defines +#define MMB_KEY_SHIFT 6 + +/** \brief Max size of a flat multibit. */ +#define MMB_FLAT_MAX_BITS 256 + +// Utility functions and data +// see multibit.c for contents +extern const u8 mmbit_keyshift_lut[32]; +extern const u8 mmbit_maxlevel_from_keyshift_lut[32]; +extern const u8 mmbit_maxlevel_direct_lut[32]; +extern const u32 mmbit_root_offset_from_level[7]; +extern const u64a mmbit_zero_to_lut[65]; + +static really_inline +MMB_TYPE mmb_load(const u8 * bits) { + return unaligned_load_u64a(bits); +} + +static really_inline +void mmb_store(u8 *bits, MMB_TYPE val) { + unaligned_store_u64a(bits, val); +} + +static really_inline +void mmb_store_partial(u8 *bits, MMB_TYPE val, u32 block_bits) { + assert(block_bits <= MMB_KEY_BITS); + partial_store_u64a(bits, val, ROUNDUP_N(block_bits, 8U) / 8U); +} + +static really_inline +MMB_TYPE mmb_single_bit(u32 bit) { + assert(bit < MMB_KEY_BITS); + return MMB_ONE << bit; +} + +static really_inline +MMB_TYPE mmb_mask_zero_to(u32 bit) { + assert(bit <= MMB_KEY_BITS); +#ifdef ARCH_32_BIT + return mmbit_zero_to_lut[bit]; +#else + if (bit == MMB_KEY_BITS) { + return MMB_ALL_ONES; + } else { + return mmb_single_bit(bit) - MMB_ONE; + } +#endif +} + +/** \brief Returns a mask of set bits up to position \a bit. Does not handle + * the case where bit == MMB_KEY_BITS. 
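+ * For example, mmb_mask_zero_to_nocheck(3) == 0x7 (the three lowest bits).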
*/ +static really_inline +MMB_TYPE mmb_mask_zero_to_nocheck(u32 bit) { + assert(bit < MMB_KEY_BITS); +#ifdef ARCH_32_BIT + return mmbit_zero_to_lut[bit]; +#else + return mmb_single_bit(bit) - MMB_ONE; +#endif +} + +static really_inline +u32 mmb_test(MMB_TYPE val, u32 bit) { + assert(bit < MMB_KEY_BITS); + return (val >> bit) & MMB_ONE; +} + +static really_inline +void mmb_set(MMB_TYPE * val, u32 bit) { + assert(bit < MMB_KEY_BITS); + *val |= mmb_single_bit(bit); +} + +static really_inline +void mmb_clear(MMB_TYPE * val, u32 bit) { + assert(bit < MMB_KEY_BITS); + *val &= ~mmb_single_bit(bit); +} + +static really_inline +u32 mmb_ctz(MMB_TYPE val) { + return ctz64(val); +} + +static really_inline +u32 mmb_popcount(MMB_TYPE val) { + return popcount64(val); +} + +#ifndef MMMB_DEBUG +#define MDEBUG_PRINTF(x, ...) do { } while(0) +#else +#define MDEBUG_PRINTF DEBUG_PRINTF +#endif + +// Switch the following define on to trace writes to multibit. +//#define MMB_TRACE_WRITES +#ifdef MMB_TRACE_WRITES +#define MMB_TRACE(format, ...) \ + printf("mmb [%u bits @ %p] " format, total_bits, bits, ##__VA_ARGS__) +#else +#define MMB_TRACE(format, ...) \ + do { \ + } while (0) +#endif + +static really_inline +u32 mmbit_keyshift(u32 total_bits) { + assert(total_bits > 1); + u32 n = clz32(total_bits - 1); // subtract one as we're rounding down + return mmbit_keyshift_lut[n]; +} + +static really_inline +u32 mmbit_maxlevel(u32 total_bits) { + assert(total_bits > 1); + u32 n = clz32(total_bits - 1); // subtract one as we're rounding down + u32 max_level = mmbit_maxlevel_direct_lut[n]; + assert(max_level <= MMB_MAX_LEVEL); + return max_level; +} + +static really_inline +u32 mmbit_maxlevel_from_keyshift(u32 ks) { + assert(ks <= 30); + assert(ks % MMB_KEY_SHIFT == 0); + + u32 max_level = mmbit_maxlevel_from_keyshift_lut[ks]; + assert(max_level <= MMB_MAX_LEVEL); + return max_level; +} + +/** \brief get our keyshift for the current level */ +static really_inline +u32 mmbit_get_ks(u32 max_level, u32 level) { + assert(max_level <= MMB_MAX_LEVEL); + assert(level <= max_level); + return (max_level - level) * MMB_KEY_SHIFT; +} + +/** \brief get our key value for the current level */ +static really_inline +u32 mmbit_get_key_val(u32 max_level, u32 level, u32 key) { + return (key >> mmbit_get_ks(max_level, level)) & MMB_KEY_MASK; +} + +/** \brief get the level root for the current level */ +static really_inline +u8 *mmbit_get_level_root(u8 *bits, u32 level) { + assert(level < ARRAY_LENGTH(mmbit_root_offset_from_level)); + return bits + mmbit_root_offset_from_level[level] * sizeof(MMB_TYPE); +} + +/** \brief get the level root for the current level as const */ +static really_inline +const u8 *mmbit_get_level_root_const(const u8 *bits, u32 level) { + assert(level < ARRAY_LENGTH(mmbit_root_offset_from_level)); + return bits + mmbit_root_offset_from_level[level] * sizeof(MMB_TYPE); +} + +/** \brief get the block for this key on the current level as a u8 ptr */ +static really_inline +u8 *mmbit_get_block_ptr(u8 *bits, u32 max_level, u32 level, u32 key) { + u8 *level_root = mmbit_get_level_root(bits, level); + u32 ks = mmbit_get_ks(max_level, level); + return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT)) * sizeof(MMB_TYPE); +} + +/** \brief get the block for this key on the current level as a const u8 ptr */ +static really_inline +const u8 *mmbit_get_block_ptr_const(const u8 *bits, u32 max_level, u32 level, + u32 key) { + const u8 *level_root = mmbit_get_level_root_const(bits, level); + u32 ks = mmbit_get_ks(max_level, level); + 
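+    /* Each level is a contiguous array of MMB_TYPE blocks; dropping the low
+     * (ks + MMB_KEY_SHIFT) bits of the key selects the block that holds this
+     * key's bit at this level. */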
return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT)) * sizeof(MMB_TYPE); +} + +/** \brief get the _byte_ for this key on the current level as a u8 ptr */ +static really_inline +u8 *mmbit_get_byte_ptr(u8 *bits, u32 max_level, u32 level, u32 key) { + u8 *level_root = mmbit_get_level_root(bits, level); + u32 ks = mmbit_get_ks(max_level, level); + return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT - 3)); +} + +/** \brief get our key value for the current level */ +static really_inline +u32 mmbit_get_key_val_byte(u32 max_level, u32 level, u32 key) { + return (key >> (mmbit_get_ks(max_level, level))) & 0x7; +} + +/** \brief Load a flat bitvector block corresponding to N bits. */ +static really_inline +MMB_TYPE mmbit_get_flat_block(const u8 *bits, u32 n_bits) { + assert(n_bits <= MMB_KEY_BITS); + u32 n_bytes = ROUNDUP_N(n_bits, 8) / 8; + switch (n_bytes) { + case 1: + return *bits; + case 2: + return unaligned_load_u16(bits); + case 3: + case 4: { + u32 rv; + assert(n_bytes <= sizeof(rv)); + memcpy(&rv, bits + n_bytes - sizeof(rv), sizeof(rv)); + rv >>= (sizeof(rv) - n_bytes) * 8; /* need to shift to get things in + * the right position and remove + * junk */ + assert(rv == partial_load_u32(bits, n_bytes)); + return rv; + } + default: { + u64a rv; + assert(n_bytes <= sizeof(rv)); + memcpy(&rv, bits + n_bytes - sizeof(rv), sizeof(rv)); + rv >>= (sizeof(rv) - n_bytes) * 8; /* need to shift to get things in + * the right position and remove + * junk */ + assert(rv == partial_load_u64a(bits, n_bytes)); + return rv; + } + } +} + +/** \brief True if this multibit is small enough to use a flat model */ +static really_inline +u32 mmbit_is_flat_model(u32 total_bits) { + return total_bits <= MMB_FLAT_MAX_BITS; +} + +static really_inline +u32 mmbit_flat_size(u32 total_bits) { + assert(mmbit_is_flat_model(total_bits)); + return ROUNDUP_N(total_bits, 8) / 8; +} + +static really_inline +u32 mmbit_flat_select_byte(u32 key, UNUSED u32 total_bits) { + return key / 8; +} + +/** \brief returns the dense index of the bit in the given mask. */ +static really_inline +u32 mmbit_mask_index(u32 bit, MMB_TYPE mask) { + assert(bit < MMB_KEY_BITS); + assert(mmb_test(mask, bit)); + + mask &= mmb_mask_zero_to(bit); + if (mask == 0ULL) { + return 0; // Common case. + } + return mmb_popcount(mask); +} + +/** \brief Clear all bits. */ +static really_inline +void mmbit_clear(u8 *bits, u32 total_bits) { + MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); + MMB_TRACE("CLEAR\n"); + if (!total_bits) { + return; + } + if (mmbit_is_flat_model(total_bits)) { + memset(bits, 0, mmbit_flat_size(total_bits)); + return; + } + mmb_store(bits, 0); +} + +/** \brief Specialisation of \ref mmbit_set for flat models. 
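+ * The flat model is a plain bit vector, so this is just a test-and-set of
+ * one bit within a single byte.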
*/ +static really_inline +char mmbit_set_flat(u8 *bits, u32 total_bits, u32 key) { + bits += mmbit_flat_select_byte(key, total_bits); + u8 mask = 1U << (key % 8); + char was_set = !!(*bits & mask); + *bits |= mask; + return was_set; +} + +static really_inline +char mmbit_set_big(u8 *bits, u32 total_bits, u32 key) { + const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + do { + u8 * byte_ptr = mmbit_get_byte_ptr(bits, max_level, level, key); + u8 keymask = 1U << mmbit_get_key_val_byte(max_level, level, key); + u8 byte = *byte_ptr; + if (likely(!(byte & keymask))) { + *byte_ptr = byte | keymask; + while (level++ != max_level) { + u8 *block_ptr_1 = mmbit_get_block_ptr(bits, max_level, level, key); + MMB_TYPE keymask_1 = mmb_single_bit(mmbit_get_key_val(max_level, level, key)); + mmb_store(block_ptr_1, keymask_1); + } + return 0; + } + } while (level++ != max_level); + return 1; +} + +/** Internal version of \ref mmbit_set without MMB_TRACE, so it can be used by + * \ref mmbit_sparse_iter_dump. */ +static really_inline +char mmbit_set_i(u8 *bits, u32 total_bits, u32 key) { + assert(key < total_bits); + if (mmbit_is_flat_model(total_bits)) { + return mmbit_set_flat(bits, total_bits, key); + } else { + return mmbit_set_big(bits, total_bits, key); + } +} + +static really_inline +char mmbit_isset(const u8 *bits, u32 total_bits, u32 key); + +/** \brief Sets the given key in the multibit. Returns 0 if the key was NOT + * already set, 1 otherwise. */ +static really_inline +char mmbit_set(u8 *bits, u32 total_bits, u32 key) { + MDEBUG_PRINTF("%p total_bits %u key %u\n", bits, total_bits, key); + char status = mmbit_set_i(bits, total_bits, key); + MMB_TRACE("SET %u (prev status: %d)\n", key, (int)status); + assert(mmbit_isset(bits, total_bits, key)); + return status; +} + +/** \brief Specialisation of \ref mmbit_isset for flat models. */ +static really_inline +char mmbit_isset_flat(const u8 *bits, u32 total_bits, u32 key) { + bits += mmbit_flat_select_byte(key, total_bits); + return !!(*bits & (1U << (key % 8U))); +} + +static really_inline +char mmbit_isset_big(const u8 *bits, u32 total_bits, u32 key) { + const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + do { + const u8 *block_ptr = mmbit_get_block_ptr_const(bits, max_level, level, key); + MMB_TYPE block = mmb_load(block_ptr); + if (!mmb_test(block, mmbit_get_key_val(max_level, level, key))) { + return 0; + } + } while (level++ != max_level); + return 1; +} + +/** \brief Returns whether the given key is set. */ +static really_inline +char mmbit_isset(const u8 *bits, u32 total_bits, u32 key) { + MDEBUG_PRINTF("%p total_bits %u key %u\n", bits, total_bits, key); + assert(key < total_bits); + if (mmbit_is_flat_model(total_bits)) { + return mmbit_isset_flat(bits, total_bits, key); + } else { + return mmbit_isset_big(bits, total_bits, key); + } +} + +/** \brief Specialisation of \ref mmbit_unset for flat models. */ +static really_inline +void mmbit_unset_flat(u8 *bits, u32 total_bits, u32 key) { + bits += mmbit_flat_select_byte(key, total_bits); + *bits &= ~(1U << (key % 8U)); +} + +// TODO: +// build two versions of this - unset_dangerous that doesn't clear the summary +// block and a regular unset that actually clears ALL the way up the levels if +// possible - might make a utility function for the clear +static really_inline +void mmbit_unset_big(u8 *bits, u32 total_bits, u32 key) { + /* This function is lazy as it does not clear the summary block + * entry if the child becomes empty. 
This is not a correctness problem as the + * summary block entries are used to mean that their children are valid + * rather than that they have a set child. */ + const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + do { + u8 *block_ptr = mmbit_get_block_ptr(bits, max_level, level, key); + u32 key_val = mmbit_get_key_val(max_level, level, key); + MMB_TYPE block = mmb_load(block_ptr); + if (!mmb_test(block, key_val)) { + return; + } + if (level == max_level) { + mmb_clear(&block, key_val); + mmb_store(block_ptr, block); + } + } while (level++ != max_level); +} + +/** \brief Switch off a given key. */ +static really_inline +void mmbit_unset(u8 *bits, u32 total_bits, u32 key) { + MDEBUG_PRINTF("%p total_bits %u key %u\n", bits, total_bits, key); + assert(key < total_bits); + MMB_TRACE("UNSET %u (prev status: %d)\n", key, + (int)mmbit_isset(bits, total_bits, key)); + + if (mmbit_is_flat_model(total_bits)) { + mmbit_unset_flat(bits, total_bits, key); + } else { + mmbit_unset_big(bits, total_bits, key); + } +} + +/** \brief Specialisation of \ref mmbit_iterate for flat models. */ +static really_inline +u32 mmbit_iterate_flat(const u8 *bits, u32 total_bits, u32 it_in) { + // Short cut for single-block cases. + if (total_bits <= MMB_KEY_BITS) { + MMB_TYPE block = mmbit_get_flat_block(bits, total_bits); + if (it_in != MMB_INVALID) { + it_in++; + assert(it_in < total_bits); + block &= ~mmb_mask_zero_to(it_in); + } + if (block) { + return mmb_ctz(block); + } + return MMB_INVALID; + } + + const u32 last_block = total_bits / MMB_KEY_BITS; + u32 start; // starting block index + + if (it_in != MMB_INVALID) { + it_in++; + assert(it_in < total_bits); + + start = (ROUNDUP_N(it_in, MMB_KEY_BITS) / MMB_KEY_BITS) - 1; + u32 start_key = start * MMB_KEY_BITS; + u32 block_size = MIN(MMB_KEY_BITS, total_bits - start_key); + MMB_TYPE block = + mmbit_get_flat_block(bits + (start * sizeof(MMB_TYPE)), block_size); + block &= ~mmb_mask_zero_to(it_in - start_key); + + if (block) { + return start_key + mmb_ctz(block); + } else if (start_key + MMB_KEY_BITS >= total_bits) { + return MMB_INVALID; // That was the final block. + } + start++; + } else { + start = 0; + } + + // Remaining full-sized blocks. + for (; start < last_block; start++) { + MMB_TYPE block = mmb_load(bits + (start * sizeof(MMB_TYPE))); + if (block) { + return (start * MMB_KEY_BITS) + mmb_ctz(block); + } + } + + // We may have a final, smaller than full-sized, block to deal with at the + // end. + if (total_bits % MMB_KEY_BITS) { + u32 start_key = start * MMB_KEY_BITS; + u32 block_size = MIN(MMB_KEY_BITS, total_bits - start_key); + MMB_TYPE block = + mmbit_get_flat_block(bits + (start * sizeof(MMB_TYPE)), block_size); + if (block) { + return start_key + mmb_ctz(block); + } + } + + return MMB_INVALID; +} + +static really_inline +u32 mmbit_iterate_big(const u8 * bits, u32 total_bits, u32 it_in) { + const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + u32 key = 0; + u32 key_rem = 0; + + if (it_in != MMB_INVALID) { + // We're continuing a previous iteration, so we need to go + // to max_level so we can pick up where we left off. 
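+        // (key becomes the leaf-level block index and key_rem the bit offset
+        // within that block plus one, so the scan resumes strictly after it_in)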
+ // NOTE: assumes that we're valid down the whole tree + key = it_in >> MMB_KEY_SHIFT; + key_rem = (it_in & MMB_KEY_MASK) + 1; + level = max_level; + } + while (1) { + if (key_rem < MMB_KEY_BITS) { + const u8 *block_ptr = mmbit_get_level_root_const(bits, level) + + key * sizeof(MMB_TYPE); + MMB_TYPE block + = mmb_load(block_ptr) & ~mmb_mask_zero_to_nocheck(key_rem); + if (block) { + key = (key << MMB_KEY_SHIFT) + mmb_ctz(block); + if (level++ == max_level) { + break; + } + key_rem = 0; + continue; // jump the rootwards step if we found a 'tree' non-zero bit + } + } + // rootwards step (block is zero or key_rem == MMB_KEY_BITS) + if (level-- == 0) { + return MMB_INVALID; // if we don't find anything and we're at the top level, we're done + } + key_rem = (key & MMB_KEY_MASK) + 1; + key >>= MMB_KEY_SHIFT; + } + assert(key < total_bits); + assert(mmbit_isset(bits, total_bits, key)); + return key; +} + +/** \brief Unbounded iterator. Returns the index of the next set bit after \a + * it_in, or MMB_INVALID. + * + * Note: assumes that if you pass in a value of it_in other than MMB_INVALID, + * that bit must be on (assumes all its summary blocks are set). + */ +static really_inline +u32 mmbit_iterate(const u8 *bits, u32 total_bits, u32 it_in) { + MDEBUG_PRINTF("%p total_bits %u it_in %u\n", bits, total_bits, it_in); + assert(it_in < total_bits || it_in == MMB_INVALID); + if (!total_bits) { + return MMB_INVALID; + } + if (it_in == total_bits - 1) { + return MMB_INVALID; // it_in is the last key. + } + + u32 key; + if (mmbit_is_flat_model(total_bits)) { + key = mmbit_iterate_flat(bits, total_bits, it_in); + } else { + key = mmbit_iterate_big(bits, total_bits, it_in); + } + assert(key == MMB_INVALID || mmbit_isset(bits, total_bits, key)); + return key; +} + +/** \brief Specialisation of \ref mmbit_any and \ref mmbit_any_precise for flat + * models. */ +static really_inline +char mmbit_any_flat(const u8 *bits, u32 total_bits) { + if (total_bits <= MMB_KEY_BITS) { + return !!mmbit_get_flat_block(bits, total_bits); + } + + const u8 *end = bits + mmbit_flat_size(total_bits); + for (const u8 *last = end - sizeof(MMB_TYPE); bits < last; + bits += sizeof(MMB_TYPE)) { + if (mmb_load(bits)) { + return 1; + } + } + + // Overlapping load at the end. + return !!mmb_load(end - sizeof(MMB_TYPE)); +} + +/** \brief True if any keys are (or might be) on in the given multibit. + * + * NOTE: mmbit_any is sloppy (may return true when only summary bits are set). + * Use \ref mmbit_any_precise if you need/want a correct answer. + */ +static really_inline +char mmbit_any(const u8 *bits, u32 total_bits) { + MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); + if (!total_bits) { + return 0; + } + if (mmbit_is_flat_model(total_bits)) { + return mmbit_any_flat(bits, total_bits); + } + return !!mmb_load(bits); +} + +/** \brief True if there are any keys on. Guaranteed precise. 
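+ * Unlike \ref mmbit_any, the big-model path performs a real leaf-level
+ * iteration, so stale summary bits cannot produce a false positive.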
*/ +static really_inline +char mmbit_any_precise(const u8 *bits, u32 total_bits) { + MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); + if (!total_bits) { + return 0; + } + if (mmbit_is_flat_model(total_bits)) { + return mmbit_any_flat(bits, total_bits); + } + + return mmbit_iterate_big(bits, total_bits, MMB_INVALID) != MMB_INVALID; +} + +static really_inline +char mmbit_all_flat(const u8 *bits, u32 total_bits) { + while (total_bits > MMB_KEY_BITS) { + if (mmb_load(bits) != MMB_ALL_ONES) { + return 0; + } + bits += sizeof(MMB_TYPE); + total_bits -= MMB_KEY_BITS; + } + while (total_bits > 8) { + if (*bits != 0xff) { + return 0; + } + bits++; + total_bits -= 8; + } + u8 mask = (u8)mmb_mask_zero_to_nocheck(total_bits); + return (*bits & mask) == mask; +} + +static really_inline +char mmbit_all_big(const u8 *bits, u32 total_bits) { + u32 ks = mmbit_keyshift(total_bits); + + u32 level = 0; + for (;;) { + // Number of bits we expect to see switched on on this level. + u32 level_bits; + if (ks != 0) { + u32 next_level_width = MMB_KEY_BITS << (ks - MMB_KEY_SHIFT); + level_bits = ROUNDUP_N(total_bits, next_level_width) >> ks; + } else { + level_bits = total_bits; + } + + const u8 *block_ptr = mmbit_get_level_root_const(bits, level); + + // All full-size blocks should be all-ones. + while (level_bits >= MMB_KEY_BITS) { + MMB_TYPE block = mmb_load(block_ptr); + if (block != MMB_ALL_ONES) { + return 0; + } + block_ptr += sizeof(MMB_TYPE); + level_bits -= MMB_KEY_BITS; + } + + // If we have bits remaining, we have a runt block on the end. + if (level_bits > 0) { + MMB_TYPE block = mmb_load(block_ptr); + MMB_TYPE mask = mmb_mask_zero_to_nocheck(level_bits); + if ((block & mask) != mask) { + return 0; + } + } + + if (ks == 0) { + break; + } + + ks -= MMB_KEY_SHIFT; + level++; + } + + return 1; +} + +/** \brief True if all keys are on. Guaranteed precise. */ +static really_inline +char mmbit_all(const u8 *bits, u32 total_bits) { + MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); + + if (mmbit_is_flat_model(total_bits)) { + return mmbit_all_flat(bits, total_bits); + } + return mmbit_all_big(bits, total_bits); +} + +static really_inline +MMB_TYPE get_flat_masks(u32 base, u32 it_start, u32 it_end) { + if (it_end <= base) { + return 0; + } + u32 udiff = it_end - base; + MMB_TYPE mask = udiff < 64 ? mmb_mask_zero_to_nocheck(udiff) : MMB_ALL_ONES; + if (it_start >= base) { + u32 ldiff = it_start - base; + MMB_TYPE lmask = ldiff < 64 ? ~mmb_mask_zero_to_nocheck(ldiff) : 0; + mask &= lmask; + } + return mask; +} + +/** \brief Specialisation of \ref mmbit_iterate_bounded for flat models. */ +static really_inline +u32 mmbit_iterate_bounded_flat(const u8 *bits, u32 total_bits, u32 begin, + u32 end) { + // Short cut for single-block cases. + if (total_bits <= MMB_KEY_BITS) { + MMB_TYPE block = mmbit_get_flat_block(bits, total_bits); + block &= get_flat_masks(0, begin, end); + if (block) { + return mmb_ctz(block); + } + return MMB_INVALID; + } + + const u32 last_block = ROUNDDOWN_N(total_bits, MMB_KEY_BITS); + + // Iterate over full-sized blocks. + for (u32 i = ROUNDDOWN_N(begin, MMB_KEY_BITS), e = MIN(end, last_block); + i < e; i += MMB_KEY_BITS) { + const u8 *block_ptr = bits + i / 8; + MMB_TYPE block = mmb_load(block_ptr); + block &= get_flat_masks(i, begin, end); + if (block) { + return i + mmb_ctz(block); + } + } + + // Final block, which is less than full-sized. 
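+    // (only reached when the bounded range runs into the runt block past the
+    // last full 64-bit word)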
+ if (end > last_block) { + const u8 *block_ptr = bits + last_block / 8; + u32 num_bits = total_bits - last_block; + MMB_TYPE block = mmbit_get_flat_block(block_ptr, num_bits); + block &= get_flat_masks(last_block, begin, end); + if (block) { + return last_block + mmb_ctz(block); + } + } + + return MMB_INVALID; +} + +static really_inline +MMB_TYPE get_lowhi_masks(u32 level, u32 max_level, u64a block_min, u64a block_max, + u64a block_base) { + const u32 level_shift = (max_level - level) * MMB_KEY_SHIFT; + u64a lshift = (block_min - block_base) >> level_shift; + u64a ushift = (block_max - block_base) >> level_shift; + MMB_TYPE lmask = lshift < 64 ? ~mmb_mask_zero_to_nocheck(lshift) : 0; + MMB_TYPE umask = + ushift < 63 ? mmb_mask_zero_to_nocheck(ushift + 1) : MMB_ALL_ONES; + return lmask & umask; +} + +static really_inline +u32 mmbit_iterate_bounded_big(const u8 *bits, u32 total_bits, u32 it_start, u32 it_end) { + u64a key = 0; + u32 ks = mmbit_keyshift(total_bits); + const u32 max_level = mmbit_maxlevel_from_keyshift(ks); + u32 level = 0; + --it_end; // make end-limit inclusive + for (;;) { + assert(level <= max_level); + + u64a block_width = MMB_KEY_BITS << ks; + u64a block_base = key * block_width; + u64a block_min = MAX(it_start, block_base); + u64a block_max = MIN(it_end, block_base + block_width - 1); + const u8 *block_ptr = + mmbit_get_level_root_const(bits, level) + key * sizeof(MMB_TYPE); + MMB_TYPE block = mmb_load(block_ptr); + block &= get_lowhi_masks(level, max_level, block_min, block_max, block_base); + if (block) { + // Found a bit, go down a level + key = (key << MMB_KEY_SHIFT) + mmb_ctz(block); + if (level++ == max_level) { + return key; + } + ks -= MMB_KEY_SHIFT; + } else { + // No bit found, go up a level + // we know that this block didn't have any answers, so we can push + // our start iterator forward. + u64a next_start = block_base + block_width; + if (next_start > it_end) { + break; + } + if (level-- == 0) { + break; + } + it_start = next_start; + key >>= MMB_KEY_SHIFT; + ks += MMB_KEY_SHIFT; + } + } + return MMB_INVALID; +} + +/** \brief Bounded iterator. Returns the index of the first set bit between + * it_start (inclusive) and it_end (exclusive) or MMB_INVALID if no bits are + * set in that range. + */ +static really_inline +u32 mmbit_iterate_bounded(const u8 *bits, u32 total_bits, u32 it_start, + u32 it_end) { + MDEBUG_PRINTF("%p total_bits %u it_start %u it_end %u\n", bits, total_bits, + it_start, it_end); + assert(it_start <= it_end); + assert(it_end <= total_bits); + if (!total_bits || it_end == it_start) { + return MMB_INVALID; + } + assert(it_start < total_bits); + u32 key; + if (mmbit_is_flat_model(total_bits)) { + key = mmbit_iterate_bounded_flat(bits, total_bits, it_start, it_end); + } else { + key = mmbit_iterate_bounded_big(bits, total_bits, it_start, it_end); + } + assert(key == MMB_INVALID || mmbit_isset(bits, total_bits, key)); + return key; +} + +/** \brief Specialisation of \ref mmbit_unset_range for flat models. */ +static really_inline +void mmbit_unset_range_flat(u8 *bits, u32 total_bits, u32 begin, u32 end) { + const u32 last_block = ROUNDDOWN_N(total_bits, MMB_KEY_BITS); + + // Iterate over full-sized blocks. + for (u32 i = ROUNDDOWN_N(begin, MMB_KEY_BITS), e = MIN(end, last_block); + i < e; i += MMB_KEY_BITS) { + u8 *block_ptr = bits + i / 8; + MMB_TYPE block = mmb_load(block_ptr); + MMB_TYPE mask = get_flat_masks(i, begin, end); + mmb_store(block_ptr, block & ~mask); + } + + // Final block, which is less than full-sized. 
+ if (end > last_block) { + u8 *block_ptr = bits + last_block / 8; + u32 num_bits = total_bits - last_block; + MMB_TYPE block = mmbit_get_flat_block(block_ptr, num_bits); + MMB_TYPE mask = get_flat_masks(last_block, begin, end); + mmb_store_partial(block_ptr, block & ~mask, num_bits); + } +} + +static really_inline +void mmbit_unset_range_big(u8 *bits, const u32 total_bits, u32 begin, + u32 end) { + // TODO: combine iterator and unset operation; completely replace this + u32 i = begin; + for (;;) { + i = mmbit_iterate_bounded(bits, total_bits, i, end); + if (i == MMB_INVALID) { + break; + } + mmbit_unset_big(bits, total_bits, i); + if (++i == end) { + break; + } + } +} + +/** \brief Unset a whole range of bits. Ensures that all bits between \a begin + * (inclusive) and \a end (exclusive) are switched off. */ +static really_inline +void mmbit_unset_range(u8 *bits, const u32 total_bits, u32 begin, u32 end) { + MDEBUG_PRINTF("%p total_bits %u begin %u end %u\n", bits, total_bits, begin, + end); + assert(begin <= end); + assert(end <= total_bits); + if (mmbit_is_flat_model(total_bits)) { + mmbit_unset_range_flat(bits, total_bits, begin, end); + } else { + mmbit_unset_range_big(bits, total_bits, begin, end); + } + // No bits are on in [begin, end) once we're done. + assert(MMB_INVALID == mmbit_iterate_bounded(bits, total_bits, begin, end)); +} + +/** \brief Specialisation of \ref mmbit_init_range for flat models. */ +static really_inline +void mmbit_init_range_flat(u8 *bits, const u32 total_bits, u32 begin, u32 end) { + const u32 last_block = ROUNDDOWN_N(total_bits, MMB_KEY_BITS); + + // Iterate over full-sized blocks. + for (u32 i = 0; i < last_block; i += MMB_KEY_BITS) { + mmb_store(bits + i / 8, get_flat_masks(i, begin, end)); + } + + // Final block, which is less than full-sized. + if (total_bits % MMB_KEY_BITS) { + u32 num_bits = total_bits - last_block; + MMB_TYPE block = get_flat_masks(last_block, begin, end); + mmb_store_partial(bits + last_block / 8, block, num_bits); + } +} + +static really_inline +void mmbit_init_range_big(u8 *bits, const u32 total_bits, u32 begin, u32 end) { + u32 ks = mmbit_keyshift(total_bits); + u32 level = 0; + + for (;;) { + u8 *block = mmbit_get_level_root(bits, level); + u32 k1 = begin >> ks, k2 = end >> ks; + + // Summary blocks need to account for the runt block on the end. + if ((k2 << ks) != end) { + k2++; + } + + // Partial block to deal with beginning. + block += (k1 / MMB_KEY_BITS) * sizeof(MMB_TYPE); + if (k1 % MMB_KEY_BITS) { + u32 idx = k1 / MMB_KEY_BITS; + u32 block_end = (idx + 1) * MMB_KEY_BITS; + + // Because k1 % MMB_KEY_BITS != 0, we can avoid checking edge cases + // here (see the branch in mmb_mask_zero_to). + MMB_TYPE mask = MMB_ALL_ONES << (k1 % MMB_KEY_BITS); + + if (k2 < block_end) { + assert(k2 % MMB_KEY_BITS); + mask &= mmb_mask_zero_to_nocheck(k2 % MMB_KEY_BITS); + mmb_store(block, mask); + goto next_level; + } else { + mmb_store(block, mask); + k1 = block_end; + block += sizeof(MMB_TYPE); + } + } + + // Write blocks filled with ones until we get to the last block. + for (; k1 < (k2 & ~MMB_KEY_MASK); k1 += MMB_KEY_BITS) { + mmb_store(block, MMB_ALL_ONES); + block += sizeof(MMB_TYPE); + } + + // Final block. + if (likely(k1 < k2)) { + // Again, if k2 was at a block boundary, it would have been handled + // by the previous loop, so we know k2 % MMB_KEY_BITS != 0 and can + // avoid the branch in mmb_mask_zero_to here. 
+ assert(k2 % MMB_KEY_BITS); + MMB_TYPE mask = mmb_mask_zero_to_nocheck(k2 % MMB_KEY_BITS); + mmb_store(block, mask); + } + + next_level: + if (ks == 0) { + break; // Last level is done, finished. + } + + ks -= MMB_KEY_SHIFT; + level++; + } +} + +/** \brief Initialises the multibit so that only the given range of bits are + * set. + * + * Ensures that all bits between \a begin (inclusive) and \a end (exclusive) + * are switched on. + */ +static really_inline +void mmbit_init_range(u8 *bits, const u32 total_bits, u32 begin, u32 end) { + MDEBUG_PRINTF("%p total_bits %u begin %u end %u\n", bits, total_bits, begin, + end); + assert(begin <= end); + assert(end <= total_bits); + + if (!total_bits) { + return; + } + + // Short cut for cases where we're not actually setting any bits; just + // clear the multibit. + if (begin == end) { + mmbit_clear(bits, total_bits); + return; + } + + if (mmbit_is_flat_model(total_bits)) { + mmbit_init_range_flat(bits, total_bits, begin, end); + } else { + mmbit_init_range_big(bits, total_bits, begin, end); + } + + assert(begin == end || + mmbit_iterate(bits, total_bits, MMB_INVALID) == begin); + assert(!end || begin == end || + mmbit_iterate(bits, total_bits, end - 1) == MMB_INVALID); +} + +/** \brief Determine the number of \ref mmbit_sparse_state elements required. + * */ +static really_inline +u32 mmbit_sparse_iter_state_size(u32 total_bits) { + if (mmbit_is_flat_model(total_bits)) { + return 2; + } + u32 levels = mmbit_maxlevel(total_bits); + return levels + 1; +} + +#ifdef DUMP_SUPPORT +// Dump function, defined in multibit.c. +void mmbit_sparse_iter_dump(const struct mmbit_sparse_iter *it, u32 total_bits); +#endif + +/** Internal: common loop used by mmbit_sparse_iter_{begin,next}_big. Returns + * matching next key given starting state, or MMB_INVALID. */ +static really_inline +u32 mmbit_sparse_iter_exec(const u8 *bits, u32 key, u32 *idx, u32 level, + const u32 max_level, struct mmbit_sparse_state *s, + const struct mmbit_sparse_iter *it_root, + const struct mmbit_sparse_iter *it) { + for (;;) { + MMB_TYPE block = s[level].mask; + if (block) { + u32 bit = mmb_ctz(block); + key = (key << MMB_KEY_SHIFT) + bit; + u32 bit_idx = mmbit_mask_index(bit, it->mask); + if (level++ == max_level) { + // we've found a key + *idx = it->val + bit_idx; + return key; + } else { + // iterator record is the start of the level (current it->val) + // plus N, where N is the dense index of the bit in the current + // level's itmask + u32 iter_key = it->val + bit_idx; + it = it_root + iter_key; + MMB_TYPE nextblock = + mmb_load(mmbit_get_level_root_const(bits, level) + + key * sizeof(MMB_TYPE)); + s[level].mask = nextblock & it->mask; + s[level].itkey = iter_key; + } + } else { + // No bits set in this block + if (level-- == 0) { + break; // no key available + } + key >>= MMB_KEY_SHIFT; + // Update state mask and iterator + s[level].mask &= (s[level].mask - 1); + it = it_root + s[level].itkey; + } + } + return MMB_INVALID; +} + +static really_inline +u32 mmbit_sparse_iter_begin_big(const u8 *bits, u32 total_bits, u32 *idx, + const struct mmbit_sparse_iter *it_root, + struct mmbit_sparse_state *s) { + const struct mmbit_sparse_iter *it = it_root; + u32 key = 0; + MMB_TYPE block = mmb_load(bits) & it->mask; + if (!block) { + return MMB_INVALID; + } + + // Load first block into top level state. 
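+ // s[0] holds the root block masked by the iterator; mmbit_sparse_iter_exec()
+ // then walks down the pyramid from here to locate the first matching key.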
+ const u32 max_level = mmbit_maxlevel(total_bits); + s[0].mask = block; + s[0].itkey = 0; + return mmbit_sparse_iter_exec(bits, key, idx, 0, max_level, + s, it_root, it); +} + +/** \brief Specialisation of \ref mmbit_sparse_iter_begin for flat models. */ +static really_inline +u32 mmbit_sparse_iter_begin_flat(const u8 *bits, u32 total_bits, u32 *idx, + const struct mmbit_sparse_iter *it_root, + struct mmbit_sparse_state *s) { + // Small cases have everything in the root iterator mask. + if (total_bits <= MMB_KEY_BITS) { + MMB_TYPE block = mmbit_get_flat_block(bits, total_bits); + block &= it_root->mask; + if (!block) { + return MMB_INVALID; + } + + s->mask = block; + u32 key = mmb_ctz(block); + *idx = mmbit_mask_index(key, it_root->mask); + return key; + } + + // Otherwise, the root iterator mask tells us which blocks (which we lay out + // linearly in the flat model) could contain keys. + assert(mmbit_maxlevel(total_bits) == 1); // Should only be two levels + MMB_TYPE root = it_root->mask; + for (; root; root &= (root - 1)) { + u32 bit = mmb_ctz(root); + u32 bit_idx = mmbit_mask_index(bit, it_root->mask); + u32 iter_key = it_root->val + bit_idx; + const struct mmbit_sparse_iter *it = it_root + iter_key; + u32 block_key_min = bit * MMB_KEY_BITS; + u32 block_key_max = block_key_min + MMB_KEY_BITS; + MMB_TYPE block; + if (block_key_max > total_bits) { + block_key_max = total_bits; + block = mmbit_get_flat_block(bits + (bit * sizeof(MMB_TYPE)), + block_key_max - block_key_min); + } else { + block = mmb_load(bits + (bit * sizeof(MMB_TYPE))); + } + + block &= it->mask; + if (block) { + s[0].mask = root; + s[1].mask = block; + s[1].itkey = iter_key; + u32 key = mmb_ctz(block); + *idx = it->val + mmbit_mask_index(key, it->mask); + return key + block_key_min; + } + } + + return MMB_INVALID; +} + +/** \brief Sparse iterator, find first key. + * + * Returns the first of the bits specified by the iterator \a it_root that is + * on, and initialises the state \a s. If none of the bits specified by the + * iterator are on, returns MMB_INVALID. + */ +static really_inline +u32 mmbit_sparse_iter_begin(const u8 *bits, u32 total_bits, u32 *idx, + const struct mmbit_sparse_iter *it_root, + struct mmbit_sparse_state *s) { + assert(ISALIGNED_N(it_root, alignof(struct mmbit_sparse_iter))); + + // Our state _may_ be on the stack +#ifndef _WIN32 + assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state))); +#else + assert(ISALIGNED_N(s, 4)); +#endif + + MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); + // iterator should have _something_ at the root level + assert(it_root->mask != 0); + u32 key; + if (mmbit_is_flat_model(total_bits)) { + key = mmbit_sparse_iter_begin_flat(bits, total_bits, idx, it_root, s); + } else { + key = mmbit_sparse_iter_begin_big(bits, total_bits, idx, it_root, s); + } + if (key != MMB_INVALID) { + assert(key < total_bits); + assert(mmbit_isset(bits, total_bits, key)); + } + return key; +} + +static really_inline +u32 mmbit_sparse_iter_next_big(const u8 *bits, u32 total_bits, u32 last_key, + u32 *idx, + const struct mmbit_sparse_iter *it_root, + struct mmbit_sparse_state *s) { + const u32 max_level = mmbit_maxlevel(total_bits); + u32 key = last_key >> MMB_KEY_SHIFT; + s[max_level].mask &= (s[max_level].mask - 1); + const struct mmbit_sparse_iter *it = it_root + s[max_level].itkey; + return mmbit_sparse_iter_exec(bits, key, idx, max_level, max_level, s, + it_root, it); +} + +/** \brief Specialisation of \ref mmbit_sparse_iter_next for flat models. 
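+ *
+ * The flat model keeps all of its iteration state in \a s, so this only has
+ * to strip the previously returned key from the stored mask(s) and rescan.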
*/ +static really_inline +u32 mmbit_sparse_iter_next_flat(const u8 *bits, const u32 total_bits, u32 *idx, + const struct mmbit_sparse_iter *it_root, + struct mmbit_sparse_state *s) { + if (total_bits <= MMB_KEY_BITS) { + // All of our data is already in the s->mask, so we just need to scrape + // off the next match. + s->mask &= (s->mask - 1); + if (s->mask) { + u32 key = mmb_ctz(s->mask); + *idx = mmbit_mask_index(key, it_root->mask); + return key; + } + } else { + assert(s[0].mask); + + s[1].mask &= (s[1].mask - 1); // Remove previous key from iter state. + u32 bit = mmb_ctz(s[0].mask); // Flat block currently being accessed. + + for (;;) { + if (s[1].mask) { + u32 key = mmb_ctz(s[1].mask); + const struct mmbit_sparse_iter *it = it_root + s[1].itkey; + *idx = it->val + mmbit_mask_index(key, it->mask); + key += (bit * MMB_KEY_BITS); + return key; + } + + // Otherwise, we have no keys left in this block. Consult the root + // mask and find the next one. + + s[0].mask &= s[0].mask - 1; + if (!s[0].mask) { + break; + } + + bit = mmb_ctz(s[0].mask); + u32 bit_idx = mmbit_mask_index(bit, it_root->mask); + u32 iter_key = it_root->val + bit_idx; + const struct mmbit_sparse_iter *it = it_root + iter_key; + u32 block_key_min = bit * MMB_KEY_BITS; + u32 block_key_max = block_key_min + MMB_KEY_BITS; + MMB_TYPE block; + if (block_key_max > total_bits) { + block_key_max = total_bits; + block = mmbit_get_flat_block(bits + (bit * sizeof(MMB_TYPE)), + block_key_max - block_key_min); + } else { + block = mmb_load(bits + (bit * sizeof(MMB_TYPE))); + } + + s[1].mask = block & it->mask; + s[1].itkey = iter_key; + } + } + + return MMB_INVALID; +} + +/** \brief Sparse iterator, find next key. + * + * Takes in a sparse iterator tree structure \a it_root and a state array, and + * finds the next on bit (from the set of bits specified in the iterator). + * + * NOTE: The sparse iterator stores copies of the multibit blocks in its state, + * so it is not necessarily safe to set or unset bits in the multibit while + * iterating: the changes you make may or may not be taken into account + * by the iterator. + */ +static really_inline +u32 mmbit_sparse_iter_next(const u8 *bits, u32 total_bits, u32 last_key, + u32 *idx, const struct mmbit_sparse_iter *it_root, + struct mmbit_sparse_state *s) { + assert(ISALIGNED_N(it_root, alignof(struct mmbit_sparse_iter))); + + // Our state _may_ be on the stack +#ifndef _WIN32 + assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state))); +#else + assert(ISALIGNED_N(s, 4)); +#endif + + MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); + MDEBUG_PRINTF("NEXT (total_bits=%u, last_key=%u)\n", total_bits, last_key); + UNUSED u32 last_idx = *idx; // for assertion at the end + // our iterator should have _something_ at the root level + assert(it_root->mask != 0); + assert(last_key < total_bits); + + u32 key; + if (mmbit_is_flat_model(total_bits)) { + key = mmbit_sparse_iter_next_flat(bits, total_bits, idx, it_root, s); + } else { + key = mmbit_sparse_iter_next_big(bits, total_bits, last_key, idx, + it_root, s); + } + if (key != MMB_INVALID) { + MDEBUG_PRINTF("END NEXT: key=%u, idx=%u\n", key, *idx); + assert(key < total_bits); + assert(key > last_key); + assert(mmbit_isset(bits, total_bits, key)); + assert(*idx > last_idx); + } else { + MDEBUG_PRINTF("END NEXT: no more keys\n"); + } + return key; +} + +/** \brief Specialisation of \ref mmbit_sparse_iter_unset for flat models. 
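+ *
+ * Clears the iterator's bits directly in the single-block case, or block by
+ * block via the second-level iterator records for larger flat multibits.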
*/ +static really_inline +void mmbit_sparse_iter_unset_flat(u8 *bits, u32 total_bits, + const struct mmbit_sparse_iter *it_root) { + if (total_bits <= MMB_KEY_BITS) { + // Everything is in the root mask: we can just mask those bits off. + MMB_TYPE block = mmbit_get_flat_block(bits, total_bits); + block &= ~it_root->mask; + mmb_store_partial(bits, block, total_bits); + return; + } + + // Larger case, we have two iterator levels to worry about. + u32 bit_idx = 0; + for (MMB_TYPE root = it_root->mask; root; root &= (root - 1), bit_idx++) { + u32 bit = mmb_ctz(root); + u32 block_key_min = bit * MMB_KEY_BITS; + u32 block_key_max = block_key_min + MMB_KEY_BITS; + u8 *block_ptr = bits + (bit * sizeof(MMB_TYPE)); + u32 iter_key = it_root->val + bit_idx; + const struct mmbit_sparse_iter *it = it_root + iter_key; + if (block_key_max <= total_bits) { + // Full-sized block. + MMB_TYPE block = mmb_load(block_ptr); + block &= ~it->mask; + mmb_store(block_ptr, block); + } else { + // Runt (final) block. + u32 num_bits = total_bits - block_key_min; + MMB_TYPE block = mmbit_get_flat_block(block_ptr, num_bits); + block &= ~it->mask; + mmb_store_partial(block_ptr, block, num_bits); + break; // We know this is the last block. + } + } +} + +static really_inline +void mmbit_sparse_iter_unset_big(u8 *bits, u32 total_bits, + const struct mmbit_sparse_iter *it_root, + struct mmbit_sparse_state *s) { + const struct mmbit_sparse_iter *it = it_root; + MMB_TYPE block = mmb_load(bits) & it->mask; + if (!block) { + return; + } + + u32 key = 0; + const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + + // Load first block into top level state + s[level].mask = block; + s[level].itkey = 0; + for (;;) { + block = s[level].mask; + if (block) { + if (level == max_level) { + // bottom level block: we want to mask out the bits specified + // by the iterator mask and then go back up a level. + u8 *block_ptr = + mmbit_get_level_root(bits, level) + key * sizeof(MMB_TYPE); + MMB_TYPE real_block = mmb_load(block_ptr); + real_block &= ~(it->mask); + mmb_store(block_ptr, real_block); + goto uplevel; // still cheap and nasty + } else { + u32 bit = mmb_ctz(block); + key = (key << MMB_KEY_SHIFT) + bit; + level++; + + // iterator record is the start of the level (current it->val) + // plus N, where N is the dense index of the bit in the current + // level's itmask + u32 iter_key = it->val + mmbit_mask_index(bit, it->mask); + it = it_root + iter_key; + MMB_TYPE nextblock = + mmb_load(mmbit_get_level_root_const(bits, level) + + key * sizeof(MMB_TYPE)); + s[level].mask = nextblock & it->mask; + s[level].itkey = iter_key; + } + } else { +uplevel: + // No bits set in this block + if (level == 0) { + return; // we are done + } + u8 *block_ptr = + mmbit_get_level_root(bits, level) + key * sizeof(MMB_TYPE); + MMB_TYPE real_block = mmb_load(block_ptr); + key >>= MMB_KEY_SHIFT; + level--; + + if (real_block == 0) { + // If we've zeroed our block For Real (unmasked by iterator), + // we can clear the parent bit that led us to it, so that + // we don't go down this particular garden path again later. + u32 bit = mmb_ctz(s[level].mask); + u8 *parent_ptr = + mmbit_get_level_root(bits, level) + key * sizeof(MMB_TYPE); + MMB_TYPE parent_block = mmb_load(parent_ptr); + mmb_clear(&parent_block, bit); + mmb_store(parent_ptr, parent_block); + } + + // Update state mask and iterator + s[level].mask &= (s[level].mask - 1); + it = it_root + s[level].itkey; + } + } +} + +/** \brief Sparse iterator, unset all bits. 
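+ *
+ * \a s provides per-level scratch state; the flat-model path does not use it.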
+ * + * Takes in a sparse iterator tree structure and switches off any entries found + * therein. + */ +static really_inline +void mmbit_sparse_iter_unset(u8 *bits, u32 total_bits, + const struct mmbit_sparse_iter *it, + struct mmbit_sparse_state *s) { + assert(ISALIGNED_N(it, alignof(struct mmbit_sparse_iter))); + + // Our state _may_ be on the stack +#ifndef _WIN32 + assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state))); +#else + assert(ISALIGNED_N(s, 4)); +#endif + + MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); + +#ifdef MMB_TRACE_WRITES + MMB_TRACE("ITER-UNSET iter=["); + mmbit_sparse_iter_dump(it, total_bits); + printf("] actually on=["); + struct mmbit_sparse_state tmp[MAX_SPARSE_ITER_STATES]; + u32 idx = 0; + u32 i = mmbit_sparse_iter_begin(bits, total_bits, &idx, it, tmp); + for (; i != MMB_INVALID; + i = mmbit_sparse_iter_next(bits, total_bits, i, &idx, it, tmp)) { + printf(" %u", i); + } + printf("]\n"); +#endif + + if (mmbit_is_flat_model(total_bits)) { + mmbit_sparse_iter_unset_flat(bits, total_bits, it); + } else { + mmbit_sparse_iter_unset_big(bits, total_bits, it, s); + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // MULTIBIT_H diff --git a/regex/util/multibit_compress.h b/regex/util/multibit_compress.h new file mode 100644 index 000000000..e7b4fd8e8 --- /dev/null +++ b/regex/util/multibit_compress.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** file + * \brief multibit compression API: compress / decompress / size + */ + +#ifndef MULTIBIT_COMPRESS_H +#define MULTIBIT_COMPRESS_H + +#include "multibit.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** \brief size API. */ +static really_inline +size_t mmbit_compsize(const u8 *bits, u32 total_bits) { + // Deal with flat model. + if (total_bits <= MMB_FLAT_MAX_BITS) { + return (ROUNDUP_N(total_bits, 8) / 8); + } + // Deal with all cleared mmb. + if (mmb_load(bits) == 0) { + return sizeof(MMB_TYPE); + } + // Deal with normal pyramid mmb. 
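+ // Depth-first walk over the pyramid: every block reachable through set
+ // summary bits (plus the root) is counted exactly once, since those are
+ // precisely the blocks the compressed image has to store.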
+ const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + u32 key = 0; + u32 key_rem = 0; + u32 num_block = 0; + // Iteration-version of DFS + while (1) { + if (key_rem < MMB_KEY_BITS) { + const u8 *block_ptr = mmbit_get_level_root_const(bits, level) + + key * sizeof(MMB_TYPE); + MMB_TYPE block = mmb_load(block_ptr); + MMB_TYPE block_1 = block & ~mmb_mask_zero_to_nocheck(key_rem); + if (mmb_popcount(block) == mmb_popcount(block_1)) { + num_block++; + } + if (level < max_level && block_1) { + key = (key << MMB_KEY_SHIFT) + mmb_ctz(block_1); + key_rem = 0; + level++; + continue; + } + } + if (level-- == 0) { + return sizeof(MMB_TYPE) * num_block; + } + key_rem = (key & MMB_KEY_MASK) + 1; + key >>= MMB_KEY_SHIFT; + } +} + +/** \brief compress API. */ +static really_inline +char mmbit_compress(const u8 *bits, u32 total_bits, u8 *comp, + size_t *comp_space, size_t max_comp_space) { + UNUSED u8 *comp_init = comp; + // Compute comp_size first. + size_t comp_size = mmbit_compsize(bits, total_bits); + // Check whether out of writable range. + if (comp_size > max_comp_space) { + return 0; + } + *comp_space = comp_size; // Return comp_size outside. + // Deal with flat model. + if (total_bits <= MMB_FLAT_MAX_BITS) { + memcpy(comp, bits, comp_size); + return 1; + } + // Deal with all cleared mmb. + if (mmb_load(bits) == 0) { + memcpy(comp, bits, sizeof(MMB_TYPE)); + return 1; + } + // Deal with normal pyramid mmb. + const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + u32 key = 0; + u32 key_rem = 0; + // Iteration-version of DFS + while (1) { + if (key_rem < MMB_KEY_BITS) { + const u8 *block_ptr = mmbit_get_level_root_const(bits, level) + + key * sizeof(MMB_TYPE); + MMB_TYPE block = mmb_load(block_ptr); + MMB_TYPE block_1 = block & ~mmb_mask_zero_to_nocheck(key_rem); + if (mmb_popcount(block) == mmb_popcount(block_1)) { + memcpy(comp, &block, sizeof(MMB_TYPE)); + comp += sizeof(MMB_TYPE); + } + if (level < max_level && block_1) { + key = (key << MMB_KEY_SHIFT) + mmb_ctz(block_1); + key_rem = 0; + level++; + continue; + } + } + if (level-- == 0) { + break; + } + key_rem = (key & MMB_KEY_MASK) + 1; + key >>= MMB_KEY_SHIFT; + } + assert((u32)(comp - comp_init) == comp_size); + return 1; +} + +/** \brief decompress API. */ +static really_inline +char mmbit_decompress(u8 *bits, u32 total_bits, const u8 *comp, + size_t *comp_space, size_t max_comp_space) { + UNUSED const u8 *comp_init = comp; + size_t comp_size; + // Deal with flat model. + if (total_bits <= MMB_FLAT_MAX_BITS) { + comp_size = ROUNDUP_N(total_bits, 8) / 8; + memcpy(bits, comp, comp_size); + *comp_space = comp_size; + return 1; + } + // Deal with all cleared mmb. + if (mmb_load(comp) == 0) { + comp_size = sizeof(MMB_TYPE); + memcpy(bits, comp, comp_size); + *comp_space = comp_size; + return 1; + } + // Deal with normal mmb. + u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + u32 key = 0; + u32 key_rem = 0; + UNUSED const u8 *comp_end = comp_init + max_comp_space; + // Iteration-version of DFS + memcpy(bits, comp, sizeof(MMB_TYPE)); // Copy root block first. 
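+ // The remaining blocks follow in the same depth-first order in which
+ // mmbit_compress() emitted them, so the walk below consumes them
+ // sequentially from comp.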
+ comp += sizeof(MMB_TYPE); + while (1) { + if (key_rem < MMB_KEY_BITS) { + u8 *block_ptr = mmbit_get_level_root(bits, level) + + key * sizeof(MMB_TYPE); + MMB_TYPE block = mmb_load(block_ptr); + MMB_TYPE block_1 = block & ~mmb_mask_zero_to_nocheck(key_rem); + if (level < max_level && block_1) { + key = (key << MMB_KEY_SHIFT) + mmb_ctz(block_1); + u8 *block_ptr_1 = mmbit_get_level_root(bits, level + 1) + + key * sizeof(MMB_TYPE); + memcpy(block_ptr_1, comp, sizeof(MMB_TYPE)); + comp += sizeof(MMB_TYPE); + if (comp > comp_end) { + return 0; // Out of buffer. + } + key_rem = 0; + level++; + continue; + } + } + if (level-- == 0) { + break; + } + key_rem = (key & MMB_KEY_MASK) + 1; + key >>= MMB_KEY_SHIFT; + } + comp_size = (u32)(comp - comp_init); + *comp_space = comp_size; + return 1; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // MULTBIT_COMPRESS_H + diff --git a/regex/util/multibit_internal.h b/regex/util/multibit_internal.h new file mode 100644 index 000000000..350f3bfd4 --- /dev/null +++ b/regex/util/multibit_internal.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Multibit: data structures. + * + * If all you need is the sizes of multibit's few structures, then including + * this file is a much better idea than including all of multibit.h. + */ +#ifndef MULTIBIT_INTERNAL_H +#define MULTIBIT_INTERNAL_H + +#include "ue2common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** \brief Sentinel value meaning "no key found". */ +#define MMB_INVALID 0xffffffffu + +typedef u64a MMB_TYPE; /**< Basic block type for mmbit operations. */ +#define MMB_MAX_LEVEL 6 /**< Maximum level in the mmbit pyramid. */ + +/** \brief Maximum number of keys (bits) in a multibit. */ +#define MMB_MAX_BITS (1U << 31) + +/** \brief Sparse iterator record type. + * + * A sparse iterator is a tree of these records, where val identifies the + * offset of the result for leaf nodes and points to the next record for + * intermediate nodes. 
Built by the code in multibit_build.cpp. + */ +struct mmbit_sparse_iter { + MMB_TYPE mask; + u32 val; +}; + +/** \brief Sparse iterator runtime state type. + * + * An array of these records (one per "level" in the multibit pyramid) is used + * to store the current iteration state. + */ +struct mmbit_sparse_state { + MMB_TYPE mask; //!< \brief masked last block read at this level. + u32 itkey; //!< \brief iterator offset for this level. +}; + +/** \brief Maximum number of \ref mmbit_sparse_state that could be needed. */ +#define MAX_SPARSE_ITER_STATES (6 + 1) + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // MULTIBIT_INTERNAL_H diff --git a/regex/util/pack_bits.h b/regex/util/pack_bits.h new file mode 100644 index 000000000..800ce25ec --- /dev/null +++ b/regex/util/pack_bits.h @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Functions for packing/unpacking arrays. + */ + +#ifndef UTIL_PACK_BITS_H +#define UTIL_PACK_BITS_H + +#include "ue2common.h" +#include "unaligned.h" +#include "partial_store.h" + +/** + * \brief Pack bits from an array of 32-bit words into \a out. + * + * \param out Output array. Must be large enough to store sum(bits). + * \param v Input array. + * \param bits Number of low bits in the corresponding element of \a v to pack. + * \param elements Size of the \a v and \a bits arrays. + */ +static really_inline +void pack_bits_32(char *out, const u32 *v, const u32 *bits, + const unsigned int elements); + +/** + * \brief Pack bits from an array of 64-bit words into \a out. + * + * \param out Output array. Must be large enough to store sum(bits). + * \param v Input array. + * \param bits Number of low bits in the corresponding element of \a v to pack. + * \param elements Size of the \a v and \a bits arrays. + */ +static really_inline +void pack_bits_64(char *out, const u64a *v, const u32 *bits, + const unsigned int elements); + +/** + * \brief Unpack bits into an array of 32-bit words according to the counts + * given. 
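+ *
+ * For example (assuming \a in was produced by \ref pack_bits_32 with the same
+ * \a bits array): with bits = {4, 12}, v[0] is filled from the first 4 bits of
+ * the packed stream (the low bits of the first byte) and v[1] from the
+ * following 12 bits.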
+ * + * \param v Output array. + * \param in Packed input array. + * \param bits Number of bits to unpack into the corresponding element of \a v. + * \param elements Size of the \a v and \a bits arrays. + */ +static really_inline +void unpack_bits_32(u32 *v, const u8 *in, const u32 *bits, + const unsigned int elements); + +/** + * \brief Unpack bits into an array of 64-bit words according to the counts + * given. + * + * \param v Output array. + * \param in Packed input array. + * \param bits Number of bits to unpack into the corresponding element of \a v. + * \param elements Size of the \a v and \a bits arrays. + */ +static really_inline +void unpack_bits_64(u64a *v, const u8 *in, const u32 *bits, + const unsigned int elements); + +/* + * Inline implementations follow. + */ + +static really_inline +void pack_bits_32(char *out, const u32 *v, const u32 *bits, + const unsigned int elements) { + u32 write = 0; // accumulator + u32 idx = 0; // acc holds this many bits + + for (unsigned int i = 0; i < elements; i++) { + assert(bits[i] <= 32); + write |= (v[i] << idx); + idx += bits[i]; + if (idx >= 32) { + unaligned_store_u32(out, write); + out += 4; + idx -= 32; + u32 leftover = bits[i] - idx; + if (leftover == 32) { + write = 0; + } else { + assert(leftover < 32); + write = v[i] >> leftover; + } + } + } + + // There might be a write left over. + partial_store_u32(out, write, (idx + 7) / 8); +} + +static really_inline +void pack_bits_64(char *out, const u64a *v, const u32 *bits, + const unsigned int elements) { + u64a write = 0; // accumulator + u32 idx = 0; // acc holds this many bits + + for (unsigned int i = 0; i < elements; i++) { + assert(bits[i] <= 64); + write |= (v[i] << idx); + idx += bits[i]; + if (idx >= 64) { + unaligned_store_u64a(out, write); + out += 8; + idx -= 64; + u32 leftover = bits[i] - idx; + if (leftover == 64) { + write = 0; + } else { + assert(leftover < 64); + write = v[i] >> leftover; + } + } + } + + // There might be a write left over. 
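+ // At this point fewer than 64 bits remain buffered in write; flush them,
+ // rounded up to whole bytes.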
+ DEBUG_PRINTF("partial store of idx=%u\n", idx); + partial_store_u64a(out, write, (idx + 7) / 8); +} + +static really_inline +void unpack_bits_32(u32 *v, const u8 *in, const u32 *bits, + const unsigned int elements) { + u32 used = 0; // bits used from *in + + for (unsigned int i = 0; i < elements; i++) { + assert(bits[i] <= 32); + u32 v_out = 0; // accumulator for v[i] + u32 b = bits[i]; // bits left to read for v[i] + u32 vidx = 0; // bits written to v[i] + + while (b) { + u32 read = *in >> used; + u32 bits_read = 8 - used; + + if (b <= bits_read) { + u32 mask = read & ((1U << b) - 1); + v_out |= mask << vidx; + vidx += b; + used += b; + b = 0; + if (used < 8) { + continue; // more from this *in + } + } else { + v_out |= read << vidx; + vidx += bits_read; + b -= bits_read; + } + + used = 0; + in++; + } + + v[i] = v_out; + } +} + +static really_inline +void unpack_bits_64(u64a *v, const u8 *in, const u32 *bits, + const unsigned int elements) { + u32 used = 0; // bits used from *in + + for (unsigned int i = 0; i < elements; i++) { + assert(bits[i] <= 64); + u64a v_out = 0; // accumulator for v[i] + u32 b = bits[i]; // bits left to read for v[i] + u32 vidx = 0; // bits written to v[i] + + while (b) { + u64a read = *in >> used; + u32 bits_read = 8 - used; + + if (b <= bits_read) { + u64a mask = read & ((1U << b) - 1); + v_out |= mask << vidx; + vidx += b; + used += b; + b = 0; + if (used < 8) { + continue; // more from this *in + } + } else { + v_out |= read << vidx; + vidx += bits_read; + b -= bits_read; + } + + used = 0; + in++; + } + + v[i] = v_out; + } +} + +#endif // UTIL_PACK_BITS_H diff --git a/regex/util/partial_store.h b/regex/util/partial_store.h new file mode 100644 index 000000000..a49d1fae1 --- /dev/null +++ b/regex/util/partial_store.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef PARTIAL_STORE_H +#define PARTIAL_STORE_H + +#include "ue2common.h" +#include "unaligned.h" + +/* loads/stores the least significant bytes of the values. 
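+ *
+ * e.g. partial_store_u32(p, 0xAABBCCDD, 2) writes only the two least
+ * significant bytes (0xCCDD) to p, and partial_load_u32(p, 2) reads them back.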
*/ + +static really_inline +void partial_store_u32(void *ptr, u32 value, u32 numBytes) { + assert(numBytes <= 4); + switch (numBytes) { + case 4: + unaligned_store_u32(ptr, value); + break; + case 3: + unaligned_store_u16(ptr, (u16)value); + *((u8 *)ptr + 2) = (u8)(value >> 16); + break; + case 2: + unaligned_store_u16(ptr, (u16)value); + break; + case 1: + *(u8 *)ptr = (u8)value; + break; + case 0: + break; + } +} + +static really_inline +u32 partial_load_u32(const void *ptr, u32 numBytes) { + u32 value; + assert(numBytes <= 4); + switch (numBytes) { + case 4: + value = unaligned_load_u32(ptr); + return value; + case 3: + value = unaligned_load_u16(ptr); + value |= ((u32)(*((const u8 *)ptr + 2)) << 16); + return value; + case 2: + value = unaligned_load_u16(ptr); + return value; + case 1: + value = *(const u8 *)ptr; + return value; + case 0: + break; + } + + return 0; +} + +static really_inline +void partial_store_u64a(void *ptr, u64a value, u32 numBytes) { + assert(numBytes <= 8); + switch (numBytes) { + case 8: + unaligned_store_u64a(ptr, value); + break; + case 7: + unaligned_store_u32(ptr, (u32)value); + unaligned_store_u16((u8 *)ptr + 4, (u16)(value >> 32)); + *((u8 *)ptr + 6) = (u8)(value >> 48); + break; + case 6: + unaligned_store_u32(ptr, (u32)value); + unaligned_store_u16((u8 *)ptr + 4, (u16)(value >> 32)); + break; + case 5: + unaligned_store_u32(ptr, (u32)value); + *((u8 *)ptr + 4) = (u8)(value >> 32); + break; + case 4: + unaligned_store_u32(ptr, (u32)value); + break; + case 3: + unaligned_store_u16(ptr, (u16)value); + *((u8 *)ptr + 2) = (u8)(value >> 16); + break; + case 2: + unaligned_store_u16(ptr, (u16)value); + break; + case 1: + *(u8 *)ptr = (u8)value; + break; + case 0: + break; + } +} + +static really_inline +u64a partial_load_u64a(const void *ptr, u32 numBytes) { + u64a value; + assert(numBytes <= 8); + switch (numBytes) { + case 8: + value = unaligned_load_u64a(ptr); + return value; + case 7: + value = unaligned_load_u32(ptr); + value |= (u64a)unaligned_load_u16((const u8 *)ptr + 4) << 32; + value |= (u64a)(*((const u8 *)ptr + 6)) << 48; + return value; + case 6: + value = unaligned_load_u32(ptr); + value |= (u64a)unaligned_load_u16((const u8 *)ptr + 4) << 32; + return value; + case 5: + value = unaligned_load_u32(ptr); + value |= (u64a)(*((const u8 *)ptr + 4)) << 32; + return value; + case 4: + value = unaligned_load_u32(ptr); + return value; + case 3: + value = unaligned_load_u16(ptr); + value |= (u64a)(*((const u8 *)ptr + 2)) << 16; + return value; + case 2: + value = unaligned_load_u16(ptr); + return value; + case 1: + value = *(const u8 *)ptr; + return value; + case 0: + break; + } + + return 0; +} + +#endif diff --git a/regex/util/popcount.h b/regex/util/popcount.h new file mode 100644 index 000000000..eb08f6b1b --- /dev/null +++ b/regex/util/popcount.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Platform specific popcount functions + */ + +#ifndef UTIL_POPCOUNT_H_ +#define UTIL_POPCOUNT_H_ + +#include "ue2common.h" +#include "util/arch.h" + +static really_inline +u32 popcount32(u32 x) { +#if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return _mm_popcnt_u32(x); +#else + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x55555555; + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; +#endif +} + +static really_inline +u32 popcount64(u64a x) { +#if defined(ARCH_X86_64) +# if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return (u32)_mm_popcnt_u64(x); +# else + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x5555555555555555; + x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; + return (x * 0x0101010101010101) >> 56; +# endif +#else + // Synthesise from two 32-bit cases. + return popcount32(x >> 32) + popcount32(x); +#endif +} + +#endif /* UTIL_POPCOUNT_H_ */ + diff --git a/regex/util/pqueue.h b/regex/util/pqueue.h new file mode 100644 index 000000000..f0ba12e70 --- /dev/null +++ b/regex/util/pqueue.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef PQUEUE_H +#define PQUEUE_H + +#include "ue2common.h" + +static really_inline u32 +pq_left(u32 i) { + return (i << 1) + 1; +} + +static really_inline u32 +pq_right(u32 i) { + return (i << 1) + 2; +} + +static really_inline +u32 pq_parent(u32 i) { + return (i - 1) >> 1; +} + +static really_inline +void pq_sift(PQ_T *items, u32 start, u32 end) { + u32 j = start; + PQ_T j_temp = items[j]; + + while (pq_left(j) < end) { + u32 max_child; + + if (pq_right(j) < end && PQ_COMP(items, pq_right(j), pq_left(j))) { + max_child = pq_right(j); + } else { + max_child = pq_left(j); + } + + if (PQ_COMP_B(items, max_child, j_temp)) { + items[j] = items[max_child]; + j = max_child; + } else { + /* j is already less than its children. We know heap property + * is already maintained for children we are done */ + break; + } + } + items[j] = j_temp; +} + +static really_inline +PQ_T *pq_top(PQ_T *items) { + return items; +} + +static really_inline +void pq_pop(PQ_T *items, u32 item_count) { + item_count--; + items[0] = items[item_count]; + pq_sift(items, 0, item_count); +} + +static really_inline +void pq_insert(PQ_T *items, u32 item_count, PQ_T new_item) { + u32 pos = item_count; + while (pos) { + u32 parent = pq_parent(pos); + if (!PQ_COMP_B(items, parent, new_item)) { + items[pos] = items[parent]; + pos = parent; + } else { + break; + } + } + items[pos] = new_item; +} + +static really_inline +void pq_replace_top(PQ_T *items, u32 item_count, PQ_T new_item) { + items[0] = new_item; + pq_sift(items, 0, item_count); +} + +#endif + diff --git a/regex/util/scatter.h b/regex/util/scatter.h new file mode 100644 index 000000000..40a1ab248 --- /dev/null +++ b/regex/util/scatter.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef UTIL_SCATTER_H +#define UTIL_SCATTER_H + +#include "ue2common.h" + +#define SCATTER_STRUCT(t) \ + struct scatter_unit_##t { u32 offset; t val; }; + +SCATTER_STRUCT(u64a) +SCATTER_STRUCT(u32) +SCATTER_STRUCT(u16) +SCATTER_STRUCT(u8) + +struct scatter_full_plan { + u32 s_u64a_offset; + u32 s_u64a_count; + u32 s_u32_offset; + u32 s_u32_count; + u32 s_u16_offset; + u32 s_u16_count; + u32 s_u8_count; + u32 s_u8_offset; +}; + +#undef SCATTER_STRUCT + +#endif diff --git a/regex/util/scatter_runtime.h b/regex/util/scatter_runtime.h new file mode 100644 index 000000000..09bc742d9 --- /dev/null +++ b/regex/util/scatter_runtime.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef UTIL_SCATTER_RUNTIME_H +#define UTIL_SCATTER_RUNTIME_H + +#include "scatter.h" + +#include "uniform_ops.h" + +#define SCATTER_DEF(t) \ +static really_inline \ +void scatter_##t(void *out, const struct scatter_unit_##t *plan, u32 count) { \ + for (u32 i = 0; i < count; i++) { \ + const struct scatter_unit_##t *item = plan + i; \ + DEBUG_PRINTF("storing %llu into offset %u\n", (u64a)item->val, \ + item->offset); \ + storeu_##t((char *)out + item->offset, item->val); \ + } \ +} + +SCATTER_DEF(u64a) +SCATTER_DEF(u32) +SCATTER_DEF(u16) +SCATTER_DEF(u8) + +#undef SCATTER_DEF + +static really_inline +void scatter(void *out, const void *base, const struct scatter_full_plan *p) { +#define RUN_SUB(t) \ + if (p->s_##t##_offset) { \ + assert(p->s_##t##_count); \ + const struct scatter_unit_##t *pp \ + = (const void *)(b + p->s_##t##_offset); \ + scatter_##t(out, pp, p->s_##t##_count); \ + } + + const char *b = base; + + RUN_SUB(u64a); + RUN_SUB(u32); + RUN_SUB(u16); + RUN_SUB(u8); + +#undef RUN_SUB +} + +#endif diff --git a/regex/util/simd_types.h b/regex/util/simd_types.h new file mode 100644 index 000000000..962cad6c9 --- /dev/null +++ b/regex/util/simd_types.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SIMD_TYPES_H +#define SIMD_TYPES_H + +#include "config.h" +#include "util/arch.h" +#include "util/intrinsics.h" +#include "ue2common.h" + +#if defined(HAVE_SSE2) +typedef __m128i m128; +#else +typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; +#endif + +#if defined(HAVE_AVX2) +typedef __m256i m256; +#else +typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; +#endif + +typedef struct {m128 lo; m128 mid; m128 hi;} m384; +#if defined(HAVE_AVX512) +typedef __m512i m512; +#else +typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512; +#endif + +#endif /* SIMD_TYPES_H */ + diff --git a/regex/util/simd_utils.c b/regex/util/simd_utils.c new file mode 100644 index 000000000..25a81412e --- /dev/null +++ b/regex/util/simd_utils.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2016-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Lookup tables to support SIMD operations. + */ + +#include "simd_utils.h" + +ALIGN_CL_DIRECTIVE const char vbs_mask_data[] = { + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, + + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, +}; + +#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 + +/** \brief LUT for the mask1bit functions. 
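+ *
+ * The single non-zero bytes are spaced out between runs of zero bytes so that
+ * an unaligned 16-byte load at the right offset yields a vector with exactly
+ * bit N set (see mask1bit128()).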
*/ +ALIGN_CL_DIRECTIVE const u8 simd_onebit_masks[] = { + ZEROES_32, ZEROES_32, + ZEROES_31, 0x01, ZEROES_32, + ZEROES_31, 0x02, ZEROES_32, + ZEROES_31, 0x04, ZEROES_32, + ZEROES_31, 0x08, ZEROES_32, + ZEROES_31, 0x10, ZEROES_32, + ZEROES_31, 0x20, ZEROES_32, + ZEROES_31, 0x40, ZEROES_32, + ZEROES_31, 0x80, ZEROES_32, + ZEROES_32, ZEROES_32, +}; diff --git a/regex/util/simd_utils.h b/regex/util/simd_utils.h new file mode 100644 index 000000000..d828f591b --- /dev/null +++ b/regex/util/simd_utils.h @@ -0,0 +1,1424 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef SIMD_UTILS +#define SIMD_UTILS + +#if !defined(_WIN32) && !defined(__SSSE3__) +#error SSSE3 instructions must be enabled +#endif + +#include "config.h" +#include "ue2common.h" +#include "simd_types.h" +#include "unaligned.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#ifndef __KERNEL__ +#include // for memcpy +#else +#include +#endif + +// Define a common assume_aligned using an appropriate compiler built-in, if +// it's available. Note that we need to handle C or C++ compilation. +#ifdef __cplusplus +# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED +# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +# endif +#else +# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED +# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +# endif +#endif + +// Fallback to identity case. +#ifndef assume_aligned +#define assume_aligned(x, y) (x) +#endif + +#ifdef __cplusplus +extern "C" { +#endif +extern const char vbs_mask_data[]; +#ifdef __cplusplus +} +#endif + +static really_inline m128 ones128(void) { +#if defined(__GNUC__) || defined(__INTEL_COMPILER) + /* gcc gets this right */ + return _mm_set1_epi8(0xFF); +#else + /* trick from Intel's optimization guide to generate all-ones. 
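+ * (comparing a zeroed register with itself sets every byte to 0xff).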
+ * ICC converts this to the single cmpeq instruction */ + return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); +#endif +} + +static really_inline m128 zeroes128(void) { + return _mm_setzero_si128(); +} + +/** \brief Bitwise not for m128*/ +static really_inline m128 not128(m128 a) { + return _mm_xor_si128(a, ones128()); +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); +} + +static really_inline int isnonzero128(m128 a) { + return !!diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + a = _mm_cmpeq_epi32(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { +#if defined(HAVE_SSE41) + a = _mm_cmpeq_epi64(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; +#else + u32 d = diffrich128(a, b); + return (d | (d >> 1)) & 0x5; +#endif +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm_sll_epi64(a, x); +} + +#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) +#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) +#define movemask128(a) ((u32)_mm_movemask_epi8((a))) + +#if defined(HAVE_AVX512) +static really_inline m128 cast512to128(const m512 in) { + return _mm512_castsi512_si128(in); +} +#endif + +static really_inline m128 set16x8(u8 c) { + return _mm_set1_epi8(c); +} + +static really_inline m128 set4x32(u32 c) { + return _mm_set1_epi32(c); +} + +static really_inline u32 movd(const m128 in) { + return _mm_cvtsi128_si32(in); +} + +#if defined(HAVE_AVX512) +static really_inline u32 movd512(const m512 in) { + // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), + // so we use 2-step convertions to work around. + return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); +} + +static really_inline u64a movq512(const m512 in) { + // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in), + // so we use 2-step convertions to work around. 
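+ // Cast down to the low 128-bit lane first, then extract its low 64 bits.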
+ return _mm_cvtsi128_si64(_mm512_castsi512_si128(in)); +} +#endif + +static really_inline u64a movq(const m128 in) { +#if defined(ARCH_X86_64) + return _mm_cvtsi128_si64(in); +#else // 32-bit - this is horrific + u32 lo = movd(in); + u32 hi = movd(_mm_srli_epi64(in, 32)); + return (u64a)hi << 32 | lo; +#endif +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return _mm_set_epi64x(0LL, *p); +} + +#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) +#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) + +#if defined(HAVE_SSE41) +#define extract32from128(a, imm) _mm_extract_epi32(a, imm) +#define extract64from128(a, imm) _mm_extract_epi64(a, imm) +#else +#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) +#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) +#endif + +#if !defined(HAVE_AVX2) +// TODO: this entire file needs restructuring - this carveout is awful +#define extractlow64from256(a) movq(a.lo) +#define extractlow32from256(a) movd(a.lo) +#if defined(HAVE_SSE41) +#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) +#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) +#else +#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) +#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) +#endif + +#endif // !AVX2 + +static really_inline m128 and128(m128 a, m128 b) { + return _mm_and_si128(a,b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return _mm_xor_si128(a,b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return _mm_or_si128(a,b); +} + +#if defined(HAVE_AVX512VBMI) +static really_inline m512 expand128(m128 a) { + return _mm512_broadcast_i32x4(a); +} + +static really_inline m512 expand256(m256 a) { + return _mm512_broadcast_i64x4(a); +} + +static really_inline m512 expand384(m384 a) { + u64a *lo = (u64a*)&a.lo; + u64a *mid = (u64a*)&a.mid; + u64a *hi = (u64a*)&a.hi; + return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0], + lo[1], lo[0]); +} +#endif + +static really_inline m128 andnot128(m128 a, m128 b) { + return _mm_andnot_si128(a, b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + return _mm_load_si128((const m128 *)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + *(m128 *)ptr = a; +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return _mm_loadu_si128((const m128 *)ptr); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + _mm_storeu_si128 ((m128 *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +#ifdef __cplusplus +extern "C" { +#endif +extern const u8 simd_onebit_masks[]; +#ifdef __cplusplus +} +#endif + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx 
-= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); +#if defined(HAVE_SSE41) + return !_mm_testz_si128(mask, val); +#else + return isnonzero128(and128(mask, val)); +#endif +} + +// offset must be an immediate +#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + m128 result; + result = _mm_shuffle_epi8(a, b); + return result; +} + +static really_inline +m256 pshufb_m256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + return _mm256_shuffle_epi8(a, b); +#else + m256 rv; + rv.lo = pshufb_m128(a.lo, b.lo); + rv.hi = pshufb_m128(a.hi, b.hi); + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 pshufb_m512(m512 a, m512 b) { + return _mm512_shuffle_epi8(a, b); +} + +static really_inline +m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { + return _mm512_maskz_shuffle_epi8(k, a, b); +} + +#if defined(HAVE_AVX512VBMI) +#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) +#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) +#endif + +#endif + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return pshufb_m128(in, shift_mask); +} + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return _mm_max_epu8(a, b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return _mm_min_epu8(a, b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return _mm_adds_epu8(a, b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return _mm_sub_epi8(a, b); +} + +static really_inline +m128 set64x2(u64a hi, u64a lo) { + return _mm_set_epi64x(hi, lo); +} + +/**** + **** 256-bit Primitives + ****/ + +#if defined(HAVE_AVX2) + +static really_really_inline +m256 lshift64_m256(m256 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm256_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm256_sll_epi64(a, x); +} + +#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) + +static really_inline +m256 set32x8(u32 in) { + return _mm256_set1_epi8(in); +} + +#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) +#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) + +static really_inline +m256 set2x128(m128 a) { + return _mm256_broadcastsi128_si256(a); +} + +#else + +static really_really_inline +m256 lshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = lshift64_m128(rv.lo, b); + rv.hi = lshift64_m128(rv.hi, b); + return rv; +} + +static really_inline +m256 rshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = rshift64_m128(rv.lo, b); + rv.hi = rshift64_m128(rv.hi, b); + return rv; +} +static really_inline +m256 set32x8(u32 in) { + m256 rv; + rv.lo = set16x8((u8) in); + rv.hi = rv.lo; + return rv; +} + +static really_inline +m256 eq256(m256 a, m256 b) { + m256 rv; + rv.lo = eq128(a.lo, b.lo); + rv.hi = eq128(a.hi, b.hi); + return rv; +} + +static really_inline +u32 movemask256(m256 a) { + u32 lo_mask = 
movemask128(a.lo); + u32 hi_mask = movemask128(a.hi); + return lo_mask | (hi_mask << 16); +} + +static really_inline +m256 set2x128(m128 a) { + m256 rv = {a, a}; + return rv; +} +#endif + +static really_inline m256 zeroes256(void) { +#if defined(HAVE_AVX2) + return _mm256_setzero_si256(); +#else + m256 rv = {zeroes128(), zeroes128()}; + return rv; +#endif +} + +static really_inline m256 ones256(void) { +#if defined(HAVE_AVX2) + m256 rv = _mm256_set1_epi8(0xFF); +#else + m256 rv = {ones128(), ones128()}; +#endif + return rv; +} + +#if defined(HAVE_AVX2) +static really_inline m256 and256(m256 a, m256 b) { + return _mm256_and_si256(a, b); +} +#else +static really_inline m256 and256(m256 a, m256 b) { + m256 rv; + rv.lo = and128(a.lo, b.lo); + rv.hi = and128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 or256(m256 a, m256 b) { + return _mm256_or_si256(a, b); +} +#else +static really_inline m256 or256(m256 a, m256 b) { + m256 rv; + rv.lo = or128(a.lo, b.lo); + rv.hi = or128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 xor256(m256 a, m256 b) { + return _mm256_xor_si256(a, b); +} +#else +static really_inline m256 xor256(m256 a, m256 b) { + m256 rv; + rv.lo = xor128(a.lo, b.lo); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 not256(m256 a) { + return _mm256_xor_si256(a, ones256()); +} +#else +static really_inline m256 not256(m256 a) { + m256 rv; + rv.lo = not128(a.lo); + rv.hi = not128(a.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 andnot256(m256 a, m256 b) { + return _mm256_andnot_si256(a, b); +} +#else +static really_inline m256 andnot256(m256 a, m256 b) { + m256 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} +#endif + +static really_inline int diff256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); +#else + return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); +#endif +} + +static really_inline int isnonzero256(m256 a) { +#if defined(HAVE_AVX2) + return !!diff256(a, zeroes256()); +#else + return isnonzero128(or128(a.lo, a.hi)); +#endif +} + +/** + * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + a = _mm256_cmpeq_epi32(a, b); + return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; +#else + m128 z = zeroes128(); + a.lo = _mm_cmpeq_epi32(a.lo, b.lo); + a.hi = _mm_cmpeq_epi32(a.hi, b.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); + return ~(_mm_movemask_epi8(packed)) & 0xff; +#endif +} + +/** + * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and + * returns an 8-bit mask indicating which 64-bit words contain differences. 
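+ * As with diffrich64_128(), 64-bit word i is reported at bit 2*i, so only the
+ * even bit positions of the returned mask are meaningful.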
+ */ +static really_inline u32 diffrich64_256(m256 a, m256 b) { + u32 d = diffrich256(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m256 load256(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(HAVE_AVX2) + return _mm256_load_si256((const m256 *)ptr); +#else + m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; + return rv; +#endif +} + +// aligned load of 128-bit value to low and high part of 256-bit value +static really_inline m256 load2x128(const void *ptr) { +#if defined(HAVE_AVX2) + return set2x128(load128(ptr)); +#else + assert(ISALIGNED_N(ptr, alignof(m128))); + m256 rv; + rv.hi = rv.lo = load128(ptr); + return rv; +#endif +} + +static really_inline m256 loadu2x128(const void *ptr) { + return set2x128(loadu128(ptr)); +} + +// aligned store +static really_inline void store256(void *ptr, m256 a) { + assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(HAVE_AVX2) + _mm256_store_si256((m256 *)ptr, a); +#else + ptr = assume_aligned(ptr, 16); + *(m256 *)ptr = a; +#endif +} + +// unaligned load +static really_inline m256 loadu256(const void *ptr) { +#if defined(HAVE_AVX2) + return _mm256_loadu_si256((const m256 *)ptr); +#else + m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; + return rv; +#endif +} + +// unaligned store +static really_inline void storeu256(void *ptr, m256 a) { +#if defined(HAVE_AVX2) + _mm256_storeu_si256((m256 *)ptr, a); +#else + storeu128(ptr, a.lo); + storeu128((char *)ptr + 16, a.hi); +#endif +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes256(void *ptr, m256 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m256 loadbytes256(const void *ptr, unsigned int n) { + m256 a = zeroes256(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + +static really_inline +m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { +#if defined(HAVE_AVX2) + return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); +#else + m256 rv; + rv.hi = set64x2(hi_1, hi_0); + rv.lo = set64x2(lo_1, lo_0); + return rv; +#endif +} + +#if !defined(HAVE_AVX2) +// switches on bit N in the given vector. +static really_inline +void setbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + setbit128(sub, n); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + clearbit128(sub, n); +} + +// tests bit N in the given vector. +static really_inline +char testbit256(m256 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else { + sub = val.hi; + n -= 128; + } + return testbit128(sub, n); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return x.hi; +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return x.lo; +} + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { + m256 rv = {lo, hi}; + return rv; +} + +#else // AVX2 + +// switches on bit N in the given vector. 
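+// mask1bit256() above yields a vector with only bit N set: simd_onebit_masks
+// is laid out so that an unaligned load at ((N % 8) * 64) + 95 - N / 8 puts
+// the byte (1 << (N % 8)) at byte N / 8 of the result and zeroes elsewhere.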
+static really_inline +void setbit256(m256 *ptr, unsigned int n) { + *ptr = or256(mask1bit256(n), *ptr); +} + +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + *ptr = andnot256(mask1bit256(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit256(m256 val, unsigned int n) { + const m256 mask = mask1bit256(n); + return !_mm256_testz_si256(mask, val); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return _mm256_extracti128_si256(x, 1); +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return _mm256_extracti128_si256(x, 0); +} + +#define cast256to128(a) _mm256_castsi256_si128(a) +#define cast128to256(a) _mm256_castsi128_si256(a) +#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) +#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) +#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) +#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) +#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) +#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) +#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) +#define extractlow32from256(a) movd(cast256to128(a)) +#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) +#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) +#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { +#if defined(_mm256_set_m128i) + return _mm256_set_m128i(hi, lo); +#else + return insert128to256(cast128to256(lo), hi, 1); +#endif +} +#endif //AVX2 + +#if defined(HAVE_AVX512) +#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) +#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) +#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) +#define set2x256(a) _mm512_broadcast_i64x4(a) +#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) +#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) +#endif + +/**** + **** 384-bit Primitives + ****/ + +static really_inline m384 and384(m384 a, m384 b) { + m384 rv; + rv.lo = and128(a.lo, b.lo); + rv.mid = and128(a.mid, b.mid); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m384 or384(m384 a, m384 b) { + m384 rv; + rv.lo = or128(a.lo, b.lo); + rv.mid = or128(a.mid, b.mid); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m384 xor384(m384 a, m384 b) { + m384 rv; + rv.lo = xor128(a.lo, b.lo); + rv.mid = xor128(a.mid, b.mid); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +static really_inline m384 not384(m384 a) { + m384 rv; + rv.lo = not128(a.lo); + rv.mid = not128(a.mid); + rv.hi = not128(a.hi); + return rv; +} +static really_inline m384 andnot384(m384 a, m384 b) { + m384 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.mid = andnot128(a.mid, b.mid); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_really_inline +m384 lshift64_m384(m384 a, unsigned b) { + m384 rv; + rv.lo = lshift64_m128(a.lo, b); + rv.mid = lshift64_m128(a.mid, b); + rv.hi = lshift64_m128(a.hi, b); + return rv; +} + +static really_inline m384 zeroes384(void) { + m384 rv = {zeroes128(), zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m384 ones384(void) { + m384 rv = {ones128(), ones128(), ones128()}; + return rv; +} + +static really_inline int diff384(m384 a, m384 b) { + return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, 
b.hi); +} + +static really_inline int isnonzero384(m384 a) { + return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); +} + +/** + * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich384(m384 a, m384 b) { + m128 z = zeroes128(); + a.lo = _mm_cmpeq_epi32(a.lo, b.lo); + a.mid = _mm_cmpeq_epi32(a.mid, b.mid); + a.hi = _mm_cmpeq_epi32(a.hi, b.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), + _mm_packs_epi32(a.hi, z)); + return ~(_mm_movemask_epi8(packed)) & 0xfff; +} + +/** + * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and + * returns a 12-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_384(m384 a, m384 b) { + u32 d = diffrich384(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m384 load384(const void *ptr) { + assert(ISALIGNED_16(ptr)); + m384 rv = { load128(ptr), load128((const char *)ptr + 16), + load128((const char *)ptr + 32) }; + return rv; +} + +// aligned store +static really_inline void store384(void *ptr, m384 a) { + assert(ISALIGNED_16(ptr)); + ptr = assume_aligned(ptr, 16); + *(m384 *)ptr = a; +} + +// unaligned load +static really_inline m384 loadu384(const void *ptr) { + m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), + loadu128((const char *)ptr + 32)}; + return rv; +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes384(void *ptr, m384 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m384 loadbytes384(const void *ptr, unsigned int n) { + m384 a = zeroes384(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +// switches on bit N in the given vector. +static really_inline +void setbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + setbit128(sub, n % 128); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + clearbit128(sub, n % 128); +} + +// tests bit N in the given vector. 
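+// Bit N of an m384 lives in the lo/mid/hi lane selected by N / 128, at
+// position N % 128 within that lane.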
+static really_inline +char testbit384(m384 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else if (n < 256) { + sub = val.mid; + } else { + sub = val.hi; + } + return testbit128(sub, n % 128); +} + +/**** + **** 512-bit Primitives + ****/ + +#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) +#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) + +static really_inline +m512 zeroes512(void) { +#if defined(HAVE_AVX512) + return _mm512_setzero_si512(); +#else + m512 rv = {zeroes256(), zeroes256()}; + return rv; +#endif +} + +static really_inline +m512 ones512(void) { +#if defined(HAVE_AVX512) + return _mm512_set1_epi8(0xFF); + //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); +#else + m512 rv = {ones256(), ones256()}; + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 set64x8(u8 a) { + return _mm512_set1_epi8(a); +} + +static really_inline +m512 set8x64(u64a a) { + return _mm512_set1_epi64(a); +} + +static really_inline +m512 set16x32(u32 a) { + return _mm512_set1_epi32(a); +} + +static really_inline +m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, + u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { + return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, + lo_3, lo_2, lo_1, lo_0); +} + +static really_inline +m512 swap256in512(m512 a) { + m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); + return vpermq512(idx, a); +} + +static really_inline +m512 set4x128(m128 a) { + return _mm512_broadcast_i32x4(a); +} + +static really_inline +m512 sadd_u8_m512(m512 a, m512 b) { + return _mm512_adds_epu8(a, b); +} + +static really_inline +m512 max_u8_m512(m512 a, m512 b) { + return _mm512_max_epu8(a, b); +} + +static really_inline +m512 min_u8_m512(m512 a, m512 b) { + return _mm512_min_epu8(a, b); +} + +static really_inline +m512 sub_u8_m512(m512 a, m512 b) { + return _mm512_sub_epi8(a, b); +} +#endif + +static really_inline +m512 and512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_and_si512(a, b); +#else + m512 rv; + rv.lo = and256(a.lo, b.lo); + rv.hi = and256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 or512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_or_si512(a, b); +#else + m512 rv; + rv.lo = or256(a.lo, b.lo); + rv.hi = or256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 xor512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_xor_si512(a, b); +#else + m512 rv; + rv.lo = xor256(a.lo, b.lo); + rv.hi = xor256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 not512(m512 a) { +#if defined(HAVE_AVX512) + return _mm512_xor_si512(a, ones512()); +#else + m512 rv; + rv.lo = not256(a.lo); + rv.hi = not256(a.hi); + return rv; +#endif +} + +static really_inline +m512 andnot512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_andnot_si512(a, b); +#else + m512 rv; + rv.lo = andnot256(a.lo, b.lo); + rv.hi = andnot256(a.hi, b.hi); + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm512_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm512_sll_epi64(a, x); +} +#else +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { + m512 rv; + rv.lo = lshift64_m256(a.lo, b); + rv.hi = lshift64_m256(a.hi, b); + return rv; +} +#endif + +#if defined(HAVE_AVX512) 
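+// Note: rshift128_m512()/lshift128_m512() below shift bytes independently
+// within each 128-bit lane, matching the 128- and 256-bit variants.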
+#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) +#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) +#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) +#endif + +#if !defined(_MM_CMPINT_NE) +#define _MM_CMPINT_NE 0x4 +#endif + +static really_inline +int diff512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); +#else + return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); +#endif +} + +static really_inline +int isnonzero512(m512 a) { +#if defined(HAVE_AVX512) + return diff512(a, zeroes512()); +#elif defined(HAVE_AVX2) + m256 x = or256(a.lo, a.hi); + return !!diff256(x, zeroes256()); +#else + m128 x = or128(a.lo.lo, a.lo.hi); + m128 y = or128(a.hi.lo, a.hi.hi); + return isnonzero128(or128(x, y)); +#endif +} + +/** + * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline +u32 diffrich512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); +#elif defined(HAVE_AVX2) + return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); +#else + a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); + a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); + a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); + a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), + _mm_packs_epi32(a.hi.lo, a.hi.hi)); + return ~(_mm_movemask_epi8(packed)) & 0xffff; +#endif +} + +/** + * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and + * returns a 16-bit mask indicating which 64-bit words contain differences. + */ +static really_inline +u32 diffrich64_512(m512 a, m512 b) { + //TODO: cmp_epi64? 
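+    // Derive the 64-bit-word mask from the 32-bit-word one: OR adjacent bit
+    // pairs and keep the even positions, so 64-bit word i is flagged at bit 2*i.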
+ u32 d = diffrich512(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline +m512 load512(const void *ptr) { +#if defined(HAVE_AVX512) + return _mm512_load_si512(ptr); +#else + assert(ISALIGNED_N(ptr, alignof(m256))); + m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; + return rv; +#endif +} + +// aligned store +static really_inline +void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m512))); +#if defined(HAVE_AVX512) + return _mm512_store_si512(ptr, a); +#elif defined(HAVE_AVX2) + m512 *x = (m512 *)ptr; + store256(&x->lo, a.lo); + store256(&x->hi, a.hi); +#else + ptr = assume_aligned(ptr, 16); + *(m512 *)ptr = a; +#endif +} + +// unaligned load +static really_inline +m512 loadu512(const void *ptr) { +#if defined(HAVE_AVX512) + return _mm512_loadu_si512(ptr); +#else + m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; + return rv; +#endif +} + +// unaligned store +static really_inline +void storeu512(void *ptr, m512 a) { +#if defined(HAVE_AVX512) + _mm512_storeu_si512((m512 *)ptr, a); +#elif defined(HAVE_AVX2) + storeu256(ptr, a.lo); + storeu256((char *)ptr + 32, a.hi); +#else + storeu128(ptr, a.lo.lo); + storeu128((char *)ptr + 16, a.lo.hi); + storeu128((char *)ptr + 32, a.hi.lo); + storeu128((char *)ptr + 48, a.hi.hi); +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { + return _mm512_maskz_loadu_epi8(k, ptr); +} + +static really_inline +m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { + return _mm512_mask_loadu_epi8(src, k, ptr); +} + +static really_inline +void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) { + _mm512_mask_storeu_epi8(ptr, k, a); +} + +static really_inline +m512 set_mask_m512(__mmask64 k) { + return _mm512_movm_epi8(k); +} + +static really_inline +m256 loadu_maskz_m256(__mmask32 k, const void *ptr) { + return _mm256_maskz_loadu_epi8(k, ptr); +} +#endif + +// packed unaligned store of first N bytes +static really_inline +void storebytes512(void *ptr, m512 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m512 loadbytes512(const void *ptr, unsigned int n) { + m512 a = zeroes512(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m512 mask1bit512(unsigned int n) { + assert(n < sizeof(m512) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu512(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline +void setbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); +#if !defined(HAVE_AVX2) + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + setbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + *ptr = or512(mask1bit512(n), *ptr); +#else + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + setbit256(sub, n); +#endif +} + +// switches off bit N in the given vector. 
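+// As in setbit512(): without AVX2 dispatch to the 128-bit quarters, on AVX512
+// use a one-bit andnot mask, and otherwise fall back to the 256-bit halves.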
+static really_inline +void clearbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); +#if !defined(HAVE_AVX2) + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + clearbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + *ptr = andnot512(mask1bit512(n), *ptr); +#else + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + clearbit256(sub, n); +#endif +} + +// tests bit N in the given vector. +static really_inline +char testbit512(m512 val, unsigned int n) { + assert(n < sizeof(val) * 8); +#if !defined(HAVE_AVX2) + m128 sub; + if (n < 128) { + sub = val.lo.lo; + } else if (n < 256) { + sub = val.lo.hi; + } else if (n < 384) { + sub = val.hi.lo; + } else { + sub = val.hi.hi; + } + return testbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + const m512 mask = mask1bit512(n); + return !!_mm512_test_epi8_mask(mask, val); +#else + m256 sub; + if (n < 256) { + sub = val.lo; + } else { + sub = val.hi; + n -= 256; + } + return testbit256(sub, n); +#endif +} + +#endif diff --git a/regex/util/state_compress.c b/regex/util/state_compress.c new file mode 100644 index 000000000..e29d5935d --- /dev/null +++ b/regex/util/state_compress.c @@ -0,0 +1,617 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Mask-based state compression, used by the NFA. + */ +#include "config.h" +#include "ue2common.h" +#include "arch.h" +#include "bitutils.h" +#include "unaligned.h" +#include "pack_bits.h" +#include "partial_store.h" +#include "popcount.h" +#include "state_compress.h" + +#ifndef __KERNEL__ +#include +#else +#include +#endif + +/* + * 32-bit store/load. 
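+ *
+ * storecompressed32() packs the bits of *x selected by the mask *m into the
+ * low bits of a word (compress32()) and writes the low `bytes` bytes of that
+ * word; loadcompressed32() undoes the transform with expand32().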
+ */ + +void storecompressed32(void *ptr, const u32 *x, const u32 *m, u32 bytes) { + assert(popcount32(*m) <= bytes * 8); + + u32 v = compress32(*x, *m); + partial_store_u32(ptr, v, bytes); +} + +void loadcompressed32(u32 *x, const void *ptr, const u32 *m, u32 bytes) { + assert(popcount32(*m) <= bytes * 8); + + u32 v = partial_load_u32(ptr, bytes); + *x = expand32(v, *m); +} + +/* + * 64-bit store/load. + */ + +void storecompressed64(void *ptr, const u64a *x, const u64a *m, u32 bytes) { + assert(popcount64(*m) <= bytes * 8); + + u64a v = compress64(*x, *m); + partial_store_u64a(ptr, v, bytes); +} + +void loadcompressed64(u64a *x, const void *ptr, const u64a *m, u32 bytes) { + assert(popcount64(*m) <= bytes * 8); + + u64a v = partial_load_u64a(ptr, bytes); + *x = expand64(v, *m); +} + +/* + * 128-bit store/load. + */ + +#if defined(ARCH_32_BIT) +static really_inline +void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) { + // First, decompose our vectors into 32-bit chunks. + u32 x[4]; + memcpy(x, &xvec, sizeof(xvec)); + u32 m[4]; + memcpy(m, &mvec, sizeof(mvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 bits[4] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]) }; + + // Compress each 32-bit chunk individually. + u32 v[4] = { compress32(x[0], m[0]), compress32(x[1], m[1]), + compress32(x[2], m[2]), compress32(x[3], m[3]) }; + + // Write packed data out. + pack_bits_32(ptr, v, bits, 4); +} +#endif + +#if defined(ARCH_64_BIT) +static really_inline +void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) { + // First, decompose our vectors into 64-bit chunks. + u64a x[2]; + memcpy(x, &xvec, sizeof(xvec)); + u64a m[2]; + memcpy(m, &mvec, sizeof(mvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + + // Compress each 64-bit chunk individually. + u64a v[2] = { compress64(x[0], m[0]), compress64(x[1], m[1]) }; + + // Write packed data out. + pack_bits_64(ptr, v, bits, 2); +} +#endif + +void storecompressed128(void *ptr, const m128 *x, const m128 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + storecompressed128_64bit(ptr, *x, *m); +#else + storecompressed128_32bit(ptr, *x, *m); +#endif +} + +#if defined(ARCH_32_BIT) +static really_inline +m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { + // First, decompose our vectors into 32-bit chunks. + u32 m[8]; + memcpy(m, &mvec, sizeof(mvec)); + + u32 bits[4] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]) }; + u32 v[4]; + + unpack_bits_32(v, (const u8 *)ptr, bits, 4); + + u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]), + expand32(v[2], m[2]), expand32(v[3], m[3]) }; + + return _mm_set_epi32(x[3], x[2], x[1], x[0]); +} +#endif + +#if defined(ARCH_64_BIT) +static really_inline +m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { + // First, decompose our vectors into 64-bit chunks. 
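+    // (only the mask here; the packed state words are read back from ptr below)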
+ u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) }; + + u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + u64a v[2]; + + unpack_bits_64(v, (const u8 *)ptr, bits, 2); + + u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; + + return _mm_set_epi64x(x[1], x[0]); +} +#endif + +void loadcompressed128(m128 *x, const void *ptr, const m128 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + *x = loadcompressed128_64bit(ptr, *m); +#else + *x = loadcompressed128_32bit(ptr, *m); +#endif +} + +/* + * 256-bit store/load. + */ + +#if defined(ARCH_32_BIT) +static really_inline +void storecompressed256_32bit(void *ptr, m256 xvec, m256 mvec) { + // First, decompose our vectors into 32-bit chunks. + u32 x[8]; + memcpy(x, &xvec, sizeof(xvec)); + u32 m[8]; + memcpy(m, &mvec, sizeof(mvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 bits[8] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]), + popcount32(m[4]), popcount32(m[5]), + popcount32(m[6]), popcount32(m[7])}; + + // Compress each 32-bit chunk individually. + u32 v[8] = { compress32(x[0], m[0]), compress32(x[1], m[1]), + compress32(x[2], m[2]), compress32(x[3], m[3]), + compress32(x[4], m[4]), compress32(x[5], m[5]), + compress32(x[6], m[6]), compress32(x[7], m[7]) }; + + // Write packed data out. + pack_bits_32(ptr, v, bits, 8); +} +#endif + +#if defined(ARCH_64_BIT) +static really_really_inline +void storecompressed256_64bit(void *ptr, m256 xvec, m256 mvec) { + // First, decompose our vectors into 64-bit chunks. + u64a x[4]; + memcpy(x, &xvec, sizeof(xvec)); + u64a m[4]; + memcpy(m, &mvec, sizeof(mvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 bits[4] = { popcount64(m[0]), popcount64(m[1]), + popcount64(m[2]), popcount64(m[3]) }; + + // Compress each 64-bit chunk individually. + u64a v[4] = { compress64(x[0], m[0]), compress64(x[1], m[1]), + compress64(x[2], m[2]), compress64(x[3], m[3]) }; + + // Write packed data out. + pack_bits_64(ptr, v, bits, 4); +} +#endif + +void storecompressed256(void *ptr, const m256 *x, const m256 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + storecompressed256_64bit(ptr, *x, *m); +#else + storecompressed256_32bit(ptr, *x, *m); +#endif +} + +#if defined(ARCH_32_BIT) +static really_inline +m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { + // First, decompose our vectors into 32-bit chunks. + u32 m[8]; + memcpy(m, &mvec, sizeof(mvec)); + + u32 bits[8] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]), + popcount32(m[4]), popcount32(m[5]), + popcount32(m[6]), popcount32(m[7])}; + u32 v[8]; + + unpack_bits_32(v, (const u8 *)ptr, bits, 8); + + u32 x[8] = { expand32(v[0], m[0]), expand32(v[1], m[1]), + expand32(v[2], m[2]), expand32(v[3], m[3]), + expand32(v[4], m[4]), expand32(v[5], m[5]), + expand32(v[6], m[6]), expand32(v[7], m[7]) }; + +#if !defined(HAVE_AVX2) + m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), + .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) }; +#else + m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); +#endif + return xvec; +} +#endif + +#if defined(ARCH_64_BIT) +static really_inline +m256 loadcompressed256_64bit(const void *ptr, m256 mvec) { + // First, decompose our vectors into 64-bit chunks. 
+ u64a m[4]; + memcpy(m, &mvec, sizeof(mvec)); + + u32 bits[4] = { popcount64(m[0]), popcount64(m[1]), + popcount64(m[2]), popcount64(m[3]) }; + u64a v[4]; + + unpack_bits_64(v, (const u8 *)ptr, bits, 4); + + u64a x[4] = { expand64(v[0], m[0]), expand64(v[1], m[1]), + expand64(v[2], m[2]), expand64(v[3], m[3]) }; + +#if !defined(HAVE_AVX2) + m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), + .hi = _mm_set_epi64x(x[3], x[2]) }; +#else + m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]); +#endif + return xvec; +} +#endif + +void loadcompressed256(m256 *x, const void *ptr, const m256 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + *x = loadcompressed256_64bit(ptr, *m); +#else + *x = loadcompressed256_32bit(ptr, *m); +#endif +} + +/* + * 384-bit store/load. + */ + +#if defined(ARCH_32_BIT) +static really_inline +void storecompressed384_32bit(void *ptr, m384 xvec, m384 mvec) { + // First, decompose our vectors into 32-bit chunks. + u32 x[12]; + memcpy(x, &xvec, sizeof(xvec)); + u32 m[12]; + memcpy(m, &mvec, sizeof(mvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 bits[12] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]), + popcount32(m[4]), popcount32(m[5]), + popcount32(m[6]), popcount32(m[7]), + popcount32(m[8]), popcount32(m[9]), + popcount32(m[10]), popcount32(m[11]) }; + + // Compress each 32-bit chunk individually. + u32 v[12] = { compress32(x[0], m[0]), compress32(x[1], m[1]), + compress32(x[2], m[2]), compress32(x[3], m[3]), + compress32(x[4], m[4]), compress32(x[5], m[5]), + compress32(x[6], m[6]), compress32(x[7], m[7]), + compress32(x[8], m[8]), compress32(x[9], m[9]), + compress32(x[10], m[10]), compress32(x[11], m[11])}; + + // Write packed data out. + pack_bits_32(ptr, v, bits, 12); +} +#endif + +#if defined(ARCH_64_BIT) +static really_inline +void storecompressed384_64bit(void *ptr, m384 xvec, m384 mvec) { + // First, decompose our vectors into 64-bit chunks. + u64a x[6]; + memcpy(x, &xvec, sizeof(xvec)); + u64a m[6]; + memcpy(m, &mvec, sizeof(mvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 bits[6] = { popcount64(m[0]), popcount64(m[1]), + popcount64(m[2]), popcount64(m[3]), + popcount64(m[4]), popcount64(m[5]) }; + + // Compress each 64-bit chunk individually. + u64a v[6] = { compress64(x[0], m[0]), compress64(x[1], m[1]), + compress64(x[2], m[2]), compress64(x[3], m[3]), + compress64(x[4], m[4]), compress64(x[5], m[5]) }; + + // Write packed data out. + pack_bits_64(ptr, v, bits, 6); +} +#endif + +void storecompressed384(void *ptr, const m384 *x, const m384 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + storecompressed384_64bit(ptr, *x, *m); +#else + storecompressed384_32bit(ptr, *x, *m); +#endif +} + +#if defined(ARCH_32_BIT) +static really_inline +m384 loadcompressed384_32bit(const void *ptr, m384 mvec) { + // First, decompose our vectors into 32-bit chunks. 
+ u32 m[12]; + memcpy(m, &mvec, sizeof(mvec)); + + u32 bits[12] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]), + popcount32(m[4]), popcount32(m[5]), + popcount32(m[6]), popcount32(m[7]), + popcount32(m[8]), popcount32(m[9]), + popcount32(m[10]), popcount32(m[11]) }; + u32 v[12]; + + unpack_bits_32(v, (const u8 *)ptr, bits, 12); + + u32 x[12] = { expand32(v[0], m[0]), expand32(v[1], m[1]), + expand32(v[2], m[2]), expand32(v[3], m[3]), + expand32(v[4], m[4]), expand32(v[5], m[5]), + expand32(v[6], m[6]), expand32(v[7], m[7]), + expand32(v[8], m[8]), expand32(v[9], m[9]), + expand32(v[10], m[10]), expand32(v[11], m[11]) }; + + m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), + .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]), + .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) }; + return xvec; +} +#endif + +#if defined(ARCH_64_BIT) +static really_inline +m384 loadcompressed384_64bit(const void *ptr, m384 mvec) { + // First, decompose our vectors into 64-bit chunks. + u64a m[6]; + memcpy(m, &mvec, sizeof(mvec)); + + u32 bits[6] = { popcount64(m[0]), popcount64(m[1]), + popcount64(m[2]), popcount64(m[3]), + popcount64(m[4]), popcount64(m[5]) }; + u64a v[6]; + + unpack_bits_64(v, (const u8 *)ptr, bits, 6); + + u64a x[6] = { expand64(v[0], m[0]), expand64(v[1], m[1]), + expand64(v[2], m[2]), expand64(v[3], m[3]), + expand64(v[4], m[4]), expand64(v[5], m[5]) }; + + m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), + .mid = _mm_set_epi64x(x[3], x[2]), + .hi = _mm_set_epi64x(x[5], x[4]) }; + return xvec; +} +#endif + +void loadcompressed384(m384 *x, const void *ptr, const m384 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + *x = loadcompressed384_64bit(ptr, *m); +#else + *x = loadcompressed384_32bit(ptr, *m); +#endif +} + +/* + * 512-bit store/load. + */ + +#if defined(ARCH_32_BIT) +static really_inline +void storecompressed512_32bit(void *ptr, m512 xvec, m512 mvec) { + // First, decompose our vectors into 32-bit chunks. + u32 x[16]; + memcpy(x, &xvec, sizeof(xvec)); + u32 m[16]; + memcpy(m, &mvec, sizeof(mvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 bits[16] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]), + popcount32(m[4]), popcount32(m[5]), + popcount32(m[6]), popcount32(m[7]), + popcount32(m[8]), popcount32(m[9]), + popcount32(m[10]), popcount32(m[11]), + popcount32(m[12]), popcount32(m[13]), + popcount32(m[14]), popcount32(m[15])}; + + // Compress each 32-bit chunk individually. + u32 v[16] = { compress32(x[0], m[0]), compress32(x[1], m[1]), + compress32(x[2], m[2]), compress32(x[3], m[3]), + compress32(x[4], m[4]), compress32(x[5], m[5]), + compress32(x[6], m[6]), compress32(x[7], m[7]), + compress32(x[8], m[8]), compress32(x[9], m[9]), + compress32(x[10], m[10]), compress32(x[11], m[11]), + compress32(x[12], m[12]), compress32(x[13], m[13]), + compress32(x[14], m[14]), compress32(x[15], m[15]) }; + + // Write packed data out. + pack_bits_32(ptr, v, bits, 16); +} +#endif + +#if defined(ARCH_64_BIT) +static really_inline +void storecompressed512_64bit(void *ptr, m512 xvec, m512 mvec) { + // First, decompose our vectors into 64-bit chunks. + u64a m[8]; + memcpy(m, &mvec, sizeof(mvec)); + u64a x[8]; + memcpy(x, &xvec, sizeof(xvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. 
+ u32 bits[8] = { popcount64(m[0]), popcount64(m[1]), + popcount64(m[2]), popcount64(m[3]), + popcount64(m[4]), popcount64(m[5]), + popcount64(m[6]), popcount64(m[7]) }; + + // Compress each 64-bit chunk individually. + u64a v[8] = { compress64(x[0], m[0]), compress64(x[1], m[1]), + compress64(x[2], m[2]), compress64(x[3], m[3]), + compress64(x[4], m[4]), compress64(x[5], m[5]), + compress64(x[6], m[6]), compress64(x[7], m[7]) }; + + // Write packed data out. + pack_bits_64(ptr, v, bits, 8); +} +#endif + +void storecompressed512(void *ptr, const m512 *x, const m512 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + storecompressed512_64bit(ptr, *x, *m); +#else + storecompressed512_32bit(ptr, *x, *m); +#endif +} + +#if defined(ARCH_32_BIT) +static really_inline +m512 loadcompressed512_32bit(const void *ptr, m512 mvec) { + // First, decompose our vectors into 32-bit chunks. + u32 m[16]; + memcpy(m, &mvec, sizeof(mvec)); + + u32 bits[16] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]), + popcount32(m[4]), popcount32(m[5]), + popcount32(m[6]), popcount32(m[7]), + popcount32(m[8]), popcount32(m[9]), + popcount32(m[10]), popcount32(m[11]), + popcount32(m[12]), popcount32(m[13]), + popcount32(m[14]), popcount32(m[15]) }; + u32 v[16]; + + unpack_bits_32(v, (const u8 *)ptr, bits, 16); + + u32 x[16] = { expand32(v[0], m[0]), expand32(v[1], m[1]), + expand32(v[2], m[2]), expand32(v[3], m[3]), + expand32(v[4], m[4]), expand32(v[5], m[5]), + expand32(v[6], m[6]), expand32(v[7], m[7]), + expand32(v[8], m[8]), expand32(v[9], m[9]), + expand32(v[10], m[10]), expand32(v[11], m[11]), + expand32(v[12], m[12]), expand32(v[13], m[13]), + expand32(v[14], m[14]), expand32(v[15], m[15]) }; + + m512 xvec; +#if defined(HAVE_AVX512) + xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12], + x[11], x[10], x[9], x[8], + x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); +#elif defined(HAVE_AVX2) + xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); + xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12], + x[11], x[10], x[9], x[8]); +#else + xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]); + xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]); + xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]); + xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]); +#endif + return xvec; +} +#endif + +#if defined(ARCH_64_BIT) +static really_inline +m512 loadcompressed512_64bit(const void *ptr, m512 mvec) { + // First, decompose our vectors into 64-bit chunks. 
+ u64a m[8]; + memcpy(m, &mvec, sizeof(mvec)); + + u32 bits[8] = { popcount64(m[0]), popcount64(m[1]), + popcount64(m[2]), popcount64(m[3]), + popcount64(m[4]), popcount64(m[5]), + popcount64(m[6]), popcount64(m[7]) }; + u64a v[8]; + + unpack_bits_64(v, (const u8 *)ptr, bits, 8); + + u64a x[8] = { expand64(v[0], m[0]), expand64(v[1], m[1]), + expand64(v[2], m[2]), expand64(v[3], m[3]), + expand64(v[4], m[4]), expand64(v[5], m[5]), + expand64(v[6], m[6]), expand64(v[7], m[7]) }; + +#if defined(HAVE_AVX512) + m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); +#elif defined(HAVE_AVX2) + m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), + .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; +#else + m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]), + _mm_set_epi64x(x[3], x[2]) }, + .hi = { _mm_set_epi64x(x[5], x[4]), + _mm_set_epi64x(x[7], x[6]) } }; +#endif + return xvec; +} +#endif + +void loadcompressed512(m512 *x, const void *ptr, const m512 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + *x = loadcompressed512_64bit(ptr, *m); +#else + *x = loadcompressed512_32bit(ptr, *m); +#endif +} diff --git a/regex/util/state_compress.h b/regex/util/state_compress.h new file mode 100644 index 000000000..a17d2355c --- /dev/null +++ b/regex/util/state_compress.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Mask-based state compression, used by the NFA. 
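+ *
+ * Each store/load pair packs only the state bits selected by the mask m; the
+ * `bytes` argument is ignored by the 128-bit and wider variants (see the note
+ * below). A minimal sketch for the 128-bit case:
+ *
+ *     char buf[16];
+ *     m128 mask  = set64x2(0xffULL, 0x0ULL);   // keep 8 bits of the high word
+ *     m128 state = set64x2(0xabULL, 0x12ULL);
+ *     m128 out;
+ *
+ *     storecompressed128(buf, &state, &mask, 0);
+ *     loadcompressed128(&out, buf, &mask, 0);   // out holds state & mask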
+ */ + +#ifndef STATE_COMPRESS_H +#define STATE_COMPRESS_H + +#include "simd_utils.h" +#include "ue2common.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/* Note: bytes is not used by implementations >= 128 */ + +void storecompressed32(void *ptr, const u32 *x, const u32 *m, u32 bytes); +void loadcompressed32(u32 *x, const void *ptr, const u32 *m, u32 bytes); + +void storecompressed64(void *ptr, const u64a *x, const u64a *m, u32 bytes); +void loadcompressed64(u64a *x, const void *ptr, const u64a *m, u32 bytes); + +void storecompressed128(void *ptr, const m128 *x, const m128 *m, u32 bytes); +void loadcompressed128(m128 *x, const void *ptr, const m128 *m, u32 bytes); + +void storecompressed256(void *ptr, const m256 *x, const m256 *m, u32 bytes); +void loadcompressed256(m256 *x, const void *ptr, const m256 *m, u32 bytes); + +void storecompressed384(void *ptr, const m384 *x, const m384 *m, u32 bytes); +void loadcompressed384(m384 *x, const void *ptr, const m384 *m, u32 bytes); + +void storecompressed512(void *ptr, const m512 *x, const m512 *m, u32 bytes); +void loadcompressed512(m512 *x, const void *ptr, const m512 *m, u32 bytes); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/regex/util/unaligned.h b/regex/util/unaligned.h new file mode 100644 index 000000000..299e5677c --- /dev/null +++ b/regex/util/unaligned.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Helper functions for unaligned loads and stores. 
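+ *
+ * Each helper goes through a one-member struct marked packed/may_alias, so the
+ * compiler emits a safe unaligned access without violating strict aliasing (on
+ * Windows the same effect comes from the surrounding #pragma pack).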
+ */ + +#ifndef UNALIGNED_H +#define UNALIGNED_H + +#include "ue2common.h" + +#if !defined(_WIN32) +#define PACKED__MAY_ALIAS __attribute__((packed, may_alias)) +#else +#define PACKED__MAY_ALIAS +#pragma pack(push, 1) // pack everything until told otherwise +#endif + +/// Perform an unaligned 16-bit load +static really_inline +u16 unaligned_load_u16(const void *ptr) { + struct unaligned { u16 u; } PACKED__MAY_ALIAS; + const struct unaligned *uptr = (const struct unaligned *)ptr; + return uptr->u; +} + +/// Perform an unaligned 32-bit load +static really_inline +u32 unaligned_load_u32(const void *ptr) { + struct unaligned { u32 u; } PACKED__MAY_ALIAS; + const struct unaligned *uptr = (const struct unaligned *)ptr; + return uptr->u; +} + +/// Perform an unaligned 64-bit load +static really_inline +u64a unaligned_load_u64a(const void *ptr) { + struct unaligned { u64a u; } PACKED__MAY_ALIAS; + const struct unaligned *uptr = (const struct unaligned *)ptr; + return uptr->u; +} + +/// Perform an unaligned 16-bit store +static really_inline +void unaligned_store_u16(void *ptr, u16 val) { + struct unaligned { u16 u; } PACKED__MAY_ALIAS; + struct unaligned *uptr = (struct unaligned *)ptr; + uptr->u = val; +} + +/// Perform an unaligned 32-bit store +static really_inline +void unaligned_store_u32(void *ptr, u32 val) { + struct unaligned { u32 u; } PACKED__MAY_ALIAS; + struct unaligned *uptr = (struct unaligned *)ptr; + uptr->u = val; +} + +/// Perform an unaligned 64-bit store +static really_inline +void unaligned_store_u64a(void *ptr, u64a val) { + struct unaligned { u64a u; } PACKED__MAY_ALIAS; + struct unaligned *uptr = (struct unaligned *)ptr; + uptr->u = val; +} +#if defined(_WIN32) +#pragma pack(pop) +#endif // win32 + +#undef PACKED__MAY_ALIAS + +#endif // UNALIGNED_H diff --git a/regex/util/uniform_ops.h b/regex/util/uniform_ops.h new file mode 100644 index 000000000..262104aca --- /dev/null +++ b/regex/util/uniform_ops.h @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Uniformly-named primitives named by target type. + * + * The following are a set of primitives named by target type, so that we can + * macro the hell out of all our NFA implementations. Hurrah! + */ + +#ifndef UNIFORM_OPS_H +#define UNIFORM_OPS_H + +#include "ue2common.h" +#include "simd_utils.h" +#include "unaligned.h" + +// Aligned loads +#define load_u8(a) (*(const u8 *)(a)) +#define load_u16(a) (*(const u16 *)(a)) +#define load_u32(a) (*(const u32 *)(a)) +#define load_u64a(a) (*(const u64a *)(a)) +#define load_m128(a) load128(a) +#define load_m256(a) load256(a) +#define load_m384(a) load384(a) +#define load_m512(a) load512(a) + +// Unaligned loads +#define loadu_u8(a) (*(const u8 *)(a)) +#define loadu_u16(a) unaligned_load_u16((const u8 *)(a)) +#define loadu_u32(a) unaligned_load_u32((const u8 *)(a)) +#define loadu_u64a(a) unaligned_load_u64a((const u8 *)(a)) +#define loadu_m128(a) loadu128(a) +#define loadu_m256(a) loadu256(a) +#define loadu_m384(a) loadu384(a) +#define loadu_m512(a) loadu512(a) + +// Aligned stores +#define store_u8(ptr, a) do { *(u8 *)(ptr) = (a); } while(0) +#define store_u16(ptr, a) do { *(u16 *)(ptr) = (a); } while(0) +#define store_u32(ptr, a) do { *(u32 *)(ptr) = (a); } while(0) +#define store_u64a(ptr, a) do { *(u64a *)(ptr) = (a); } while(0) +#define store_m128(ptr, a) store128(ptr, a) +#define store_m256(ptr, a) store256(ptr, a) +#define store_m384(ptr, a) store384(ptr, a) +#define store_m512(ptr, a) store512(ptr, a) + +// Unaligned stores +#define storeu_u8(ptr, a) do { *(u8 *)(ptr) = (a); } while(0) +#define storeu_u16(ptr, a) unaligned_store_u16(ptr, a) +#define storeu_u32(ptr, a) unaligned_store_u32(ptr, a) +#define storeu_u64a(ptr, a) unaligned_store_u64a(ptr, a) +#define storeu_m128(ptr, a) storeu128(ptr, a) + +#define zero_u8 0 +#define zero_u32 0 +#define zero_u64a 0 +#define zero_m128 zeroes128() +#define zero_m256 zeroes256() +#define zero_m384 zeroes384() +#define zero_m512 zeroes512() + +#define ones_u8 0xff +#define ones_u32 0xfffffffful +#define ones_u64a 0xffffffffffffffffull +#define ones_m128 ones128() +#define ones_m256 ones256() +#define ones_m384 ones384() +#define ones_m512 ones512() + +#define or_u8(a, b) ((a) | (b)) +#define or_u32(a, b) ((a) | (b)) +#define or_u64a(a, b) ((a) | (b)) +#define or_m128(a, b) (or128(a, b)) +#define or_m256(a, b) (or256(a, b)) +#define or_m384(a, b) (or384(a, b)) +#define or_m512(a, b) (or512(a, b)) + +#if defined(HAVE_AVX512VBMI) +#define expand_m128(a) (expand128(a)) +#define expand_m256(a) (expand256(a)) +#define expand_m384(a) (expand384(a)) +#define expand_m512(a) (a) + +#define shuffle_byte_m128(a, b) (pshufb_m512(b, a)) +#define shuffle_byte_m256(a, b) (vpermb512(a, b)) +#define shuffle_byte_m384(a, b) (vpermb512(a, b)) +#define shuffle_byte_m512(a, b) (vpermb512(a, b)) +#endif + +#define and_u8(a, b) ((a) & (b)) +#define and_u32(a, b) ((a) & (b)) +#define and_u64a(a, b) ((a) & (b)) +#define and_m128(a, b) (and128(a, b)) +#define and_m256(a, b) (and256(a, b)) +#define and_m384(a, b) (and384(a, b)) +#define and_m512(a, b) (and512(a, b)) + +#define not_u8(a) (~(a)) +#define not_u32(a) (~(a)) +#define not_u64a(a) (~(a)) +#define not_m128(a) (not128(a)) +#define not_m256(a) (not256(a)) +#define not_m384(a) (not384(a)) +#define not_m512(a) (not512(a)) + +#define andnot_u8(a, b) ((~(a)) & (b)) +#define andnot_u32(a, b) ((~(a)) & (b)) +#define andnot_u64a(a, b) ((~(a)) & (b)) +#define andnot_m128(a, b) (andnot128(a, b)) +#define andnot_m256(a, b) (andnot256(a, b)) +#define 
andnot_m384(a, b) (andnot384(a, b)) +#define andnot_m512(a, b) (andnot512(a, b)) + +#define lshift_u32(a, b) ((a) << (b)) +#define lshift_u64a(a, b) ((a) << (b)) +#define lshift_m128(a, b) (lshift64_m128(a, b)) +#define lshift_m256(a, b) (lshift64_m256(a, b)) +#define lshift_m384(a, b) (lshift64_m384(a, b)) +#define lshift_m512(a, b) (lshift64_m512(a, b)) + +#define isZero_u8(a) ((a) == 0) +#define isZero_u32(a) ((a) == 0) +#define isZero_u64a(a) ((a) == 0) +#define isZero_m128(a) (!isnonzero128(a)) +#define isZero_m256(a) (!isnonzero256(a)) +#define isZero_m384(a) (!isnonzero384(a)) +#define isZero_m512(a) (!isnonzero512(a)) + +#define isNonZero_u8(a) ((a) != 0) +#define isNonZero_u32(a) ((a) != 0) +#define isNonZero_u64a(a) ((a) != 0) +#define isNonZero_m128(a) (isnonzero128(a)) +#define isNonZero_m256(a) (isnonzero256(a)) +#define isNonZero_m384(a) (isnonzero384(a)) +#define isNonZero_m512(a) (isnonzero512(a)) + +#define diffrich_u32(a, b) ((a) != (b)) +#define diffrich_u64a(a, b) ((a) != (b) ? 3 : 0) //TODO: impl 32bit granularity +#define diffrich_m128(a, b) (diffrich128(a, b)) +#define diffrich_m256(a, b) (diffrich256(a, b)) +#define diffrich_m384(a, b) (diffrich384(a, b)) +#define diffrich_m512(a, b) (diffrich512(a, b)) + +#define diffrich64_u32(a, b) ((a) != (b)) +#define diffrich64_u64a(a, b) ((a) != (b) ? 1 : 0) +#define diffrich64_m128(a, b) (diffrich64_128(a, b)) +#define diffrich64_m256(a, b) (diffrich64_256(a, b)) +#define diffrich64_m384(a, b) (diffrich64_384(a, b)) +#define diffrich64_m512(a, b) (diffrich64_512(a, b)) + +#define noteq_u8(a, b) ((a) != (b)) +#define noteq_u32(a, b) ((a) != (b)) +#define noteq_u64a(a, b) ((a) != (b)) +#define noteq_m128(a, b) (diff128(a, b)) +#define noteq_m256(a, b) (diff256(a, b)) +#define noteq_m384(a, b) (diff384(a, b)) +#define noteq_m512(a, b) (diff512(a, b)) + +#define partial_store_m128(ptr, v, sz) storebytes128(ptr, v, sz) +#define partial_store_m256(ptr, v, sz) storebytes256(ptr, v, sz) +#define partial_store_m384(ptr, v, sz) storebytes384(ptr, v, sz) +#define partial_store_m512(ptr, v, sz) storebytes512(ptr, v, sz) + +#define partial_load_m128(ptr, sz) loadbytes128(ptr, sz) +#define partial_load_m256(ptr, sz) loadbytes256(ptr, sz) +#define partial_load_m384(ptr, sz) loadbytes384(ptr, sz) +#define partial_load_m512(ptr, sz) loadbytes512(ptr, sz) + +#define store_compressed_u32(ptr, x, m, len) storecompressed32(ptr, x, m, len) +#define store_compressed_u64a(ptr, x, m, len) storecompressed64(ptr, x, m, len) +#define store_compressed_m128(ptr, x, m, len) storecompressed128(ptr, x, m, len) +#define store_compressed_m256(ptr, x, m, len) storecompressed256(ptr, x, m, len) +#define store_compressed_m384(ptr, x, m, len) storecompressed384(ptr, x, m, len) +#define store_compressed_m512(ptr, x, m, len) storecompressed512(ptr, x, m, len) + +#define load_compressed_u32(x, ptr, m, len) loadcompressed32(x, ptr, m, len) +#define load_compressed_u64a(x, ptr, m, len) loadcompressed64(x, ptr, m, len) +#define load_compressed_m128(x, ptr, m, len) loadcompressed128(x, ptr, m, len) +#define load_compressed_m256(x, ptr, m, len) loadcompressed256(x, ptr, m, len) +#define load_compressed_m384(x, ptr, m, len) loadcompressed384(x, ptr, m, len) +#define load_compressed_m512(x, ptr, m, len) loadcompressed512(x, ptr, m, len) + +static really_inline +void clearbit_u32(u32 *p, u32 n) { + assert(n < sizeof(*p) * 8); + *p &= ~(1U << n); +} + +static really_inline +void clearbit_u64a(u64a *p, u32 n) { + assert(n < sizeof(*p) * 8); + *p &= ~(1ULL << n); +} + +#define 
clearbit_m128(ptr, n) (clearbit128(ptr, n)) +#define clearbit_m256(ptr, n) (clearbit256(ptr, n)) +#define clearbit_m384(ptr, n) (clearbit384(ptr, n)) +#define clearbit_m512(ptr, n) (clearbit512(ptr, n)) + +static really_inline +char testbit_u32(u32 val, u32 n) { + assert(n < sizeof(val) * 8); + return !!(val & (1U << n)); +} + +static really_inline +char testbit_u64a(u64a val, u32 n) { + assert(n < sizeof(val) * 8); + return !!(val & (1ULL << n)); +} + +#define testbit_m128(val, n) (testbit128(val, n)) +#define testbit_m256(val, n) (testbit256(val, n)) +#define testbit_m384(val, n) (testbit384(val, n)) +#define testbit_m512(val, n) (testbit512(val, n)) + +#endif diff --git a/scripts/install_regex.sh b/scripts/install_regex.sh new file mode 100755 index 000000000..066065f47 --- /dev/null +++ b/scripts/install_regex.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +INST_DIR="/tmp/Hyperscan" +mkdir $INST_DIR +cd $INST_DIR + +git clone https://github.com/adrian-thurston/colm.git +cd colm +./autogen.sh +./configure +make -j$(nproc) +make install + +cd $INST_DIR + +if [[ $LD_LIBRARY_PATH =~ "/usr/local/lib" ]]; then + echo "Path already set." +else + export LD_LIBRARY_PATH="/usr/local/lib" + TTT="$(cat /etc/environment | grep LD_LIBRARY_PATH)" + if [[ ! $TTT =~ "/usr/local/lib" ]]; then + echo "LD_LIBRARY_PATH=\"/usr/local/lib\"" >> /etc/environment + fi + +fi + + +git clone https://github.com/adrian-thurston/ragel.git +cd ragel +./autogen.sh +./configure --with-colm=/usr/local +make -j$(nproc) +make install + +cd $INST_DIR + +wget https://sourceforge.net/projects/pcre/files/pcre/8.45/pcre-8.45.tar.gz +tar -xf pcre-8.45.tar.gz + +cd pcre-8.45 +./configure --enable-pcre16 --enable-pcre32 +make -j$(nproc) +make install + +cd $INST_DIR + +git clone https://github.com/tempesta-tech/linux-regex-module.git +cd linux-regex-module +git checkout ag_changes_for_easy_installation + +cmake -DCMAKE_BUILD_TYPE=Release ./ +make -j$(nproc) +make install + +cd $INST_DIR + diff --git a/scripts/regex_start.sh b/scripts/regex_start.sh new file mode 100755 index 000000000..0b25320c7 --- /dev/null +++ b/scripts/regex_start.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +rmdir /sys/kernel/config/rex/* 2> /dev/null + +script_path="$(dirname $0)" +tmp_path="/tmp/tempesta" + +echo "Start compilation of regex." > /dev/kmsg + +for filename in ${tmp_path}/*.txt; do + name=$(basename "$filename" .txt) + if [[ "$name" != "*" ]]; then + db_path="/sys/kernel/config/rex/${name}" + + rm -rf ${tmp_path}/out/ && mkdir ${tmp_path}/out + #${script_path}/hscollider -e ${filename} -ao ${tmp_path}/out/ -n1 #this version for single block strings + #${script_path}/hscollider -e ${filename} -V5 -ao ${tmp_path}/out/ -n1 #this version starts hscollider from scripts directory + hscollider -e ${filename} -V5 -ao ${tmp_path}/out/ -n1 + + mkdir $db_path + dd if=$(echo ${tmp_path}/out/*.db) of=${db_path}/database + cat "${filename}" > ${db_path}/note + echo "$name" > ${db_path}/id + fi +done + +echo "Compilation of regex files is complete." 
> /dev/kmsg + diff --git a/scripts/regex_stop.sh b/scripts/regex_stop.sh new file mode 100755 index 000000000..740ab8cc4 --- /dev/null +++ b/scripts/regex_stop.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +script_path="$(dirname $0)" + +rm -f /tmp/tempesta/*.txt +rm -rf /tmp/tempesta/out +#rmdir -p /sys/kernel/config/rex/* +rmdir /sys/kernel/config/rex/* \ No newline at end of file diff --git a/scripts/tempesta.sh b/scripts/tempesta.sh index e46ad6c0a..a9ed7f7df 100755 --- a/scripts/tempesta.sh +++ b/scripts/tempesta.sh @@ -35,6 +35,7 @@ fi script_path="$(dirname $0)" tdb_path=${TDB_PATH:="$TFW_ROOT/db/core"} +rgx_path=${REGEX_PATH:="$TFW_ROOT/regex"} tfw_path=${TFW_PATH:="$TFW_ROOT/fw"} tls_path=${TLS_PATH:="$TFW_ROOT/tls"} lib_path=${LIB_PATH:="$TFW_ROOT/lib"} @@ -45,6 +46,7 @@ lib_mod=tempesta_lib tls_mod=tempesta_tls tdb_mod=tempesta_db tfw_mod=tempesta_fw +rgx_mod=xdp_rex declare -r LONG_OPTS="help,load,unload,start,stop,restart,reload" # Exclude loopback interface since it needn't any tuning here: it hasn't RSS @@ -131,6 +133,7 @@ load_modules() { echo "Loading Tempesta kernel modules..." + mkdir /tmp/tempesta # Set verbose kernel logging, # so debug messages are shown on serial console as well. echo '8 7 1 7' > /proc/sys/kernel/printk @@ -144,6 +147,9 @@ load_modules() load_one_module "$tdb_path/$tdb_mod.ko" || error "cannot load tempesta database module" + load_one_module "$rgx_path/$rgx_mod.ko" || + error "cannot load regex module" + load_one_module "$tfw_path/$tfw_mod.ko" "tfw_cfg_path=$tfw_cfg_temp" || error "cannot load tempesta module" } @@ -153,6 +159,8 @@ unload_modules() echo "Un-loading Tempesta kernel modules..." rmmod $tfw_mod + $script_path/regex_stop.sh + rmmod $rgx_mod rmmod $tdb_mod rmmod $tls_mod rmmod $lib_mod @@ -284,6 +292,7 @@ start() unload_modules error "cannot start Tempesta FW (sysctl message: ${err##*: }), please check dmesg" else + $script_path/regex_start.sh echo "done" fi remove_tmp_conf @@ -307,10 +316,12 @@ reload() { update_js_challenge_templates echo "Running live reconfiguration of Tempesta..." + $script_path/regex_stop.sh err=$(start_tempesta_and_check) if [[ $err != "0" ]]; then error "cannot reconfigure Tempesta FW (sysctl message: ${err##*: }), please check dmesg" else + $script_path/regex_start.sh echo "done" remove_tmp_conf fi
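
A note on regex/util/unaligned.h: the packed, may_alias struct wrapper is the idiom Hyperscan uses to tell the compiler "this access may be misaligned and may alias anything", so GCC/Clang emit a plain unaligned move instead of assuming natural alignment. A minimal standalone sketch of the same idea using memcpy, which compilers typically lower to the same single instruction on x86-64 (types are spelled with <stdint.h> here instead of ue2common.h, and the function names are illustrative, not part of the header):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Unaligned 32-bit load/store without the packed struct: memcpy of a
 * fixed small size is the portable spelling and compiles down to a
 * single mov on x86-64. */
static inline uint32_t sketch_loadu_u32(const void *ptr)
{
    uint32_t v;

    memcpy(&v, ptr, sizeof(v));   /* no alignment assumed */
    return v;
}

static inline void sketch_storeu_u32(void *ptr, uint32_t val)
{
    memcpy(ptr, &val, sizeof(val));
}

int main(void)
{
    unsigned char buf[8] = { 0 };

    sketch_storeu_u32(buf + 1, 0xdeadbeef);   /* deliberately misaligned */
    printf("0x%08x\n", sketch_loadu_u32(buf + 1));
    return 0;
}

Inside kernel code the conventional equivalents are get_unaligned()/put_unaligned() from <asm/unaligned.h>; the imported header presumably keeps the upstream Hyperscan spelling so the vendored sources stay close to upstream.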
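
A note on regex/util/uniform_ops.h: the point of the op_<type> naming is that an NFA engine body can be written once against a state-type parameter and instantiated by token pasting, which is what the file's own comment means by macroing the NFA implementations. A reduced sketch of that pattern using only the integer ops so it builds standalone (JOIN, GEN_MERGE_IS_DEAD and the generated functions are illustrative names, not the real NFA macros):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;
typedef uint64_t u64a;

#define or_u32(a, b)   ((a) | (b))
#define or_u64a(a, b)  ((a) | (b))
#define isZero_u32(a)  ((a) == 0)
#define isZero_u64a(a) ((a) == 0)

#define JOIN_(a, b) a##b
#define JOIN(a, b)  JOIN_(a, b)

/* One generic body, stamped out once per state type. */
#define GEN_MERGE_IS_DEAD(T)                                    \
    static int JOIN(merge_is_dead_, T)(T a, T b)                \
    {                                                           \
        T m = JOIN(or_, T)(a, b);                               \
        return JOIN(isZero_, T)(m);                             \
    }

GEN_MERGE_IS_DEAD(u32)
GEN_MERGE_IS_DEAD(u64a)

int main(void)
{
    printf("%d %d\n", merge_is_dead_u32(0, 0), merge_is_dead_u64a(1, 0));
    return 0;
}

The real engines do the same thing with the m128/m256/m384/m512 variants, which is why every operation in the header has to exist under a uniform name for every state width.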
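
A note on the diffrich_u64a macro (the one carrying the "TODO: impl 32bit granularity" comment): diffrich is meant to return a bitmask of which 32-bit lanes differ between two values, but the scalar u64a fallback currently reports 3 (both lanes) whenever the values differ at all. A sketch of what the 32-bit-granularity version could look like, assuming bit 0 maps to the low half:

#include <stdint.h>
#include <stdio.h>

/* diffrich at real 32-bit granularity for a 64-bit scalar: bit 0 set
 * if the low 32-bit lanes differ, bit 1 if the high lanes differ.
 * The macro in uniform_ops.h currently returns 3 for any difference. */
static unsigned diffrich_u64a_sketch(uint64_t a, uint64_t b)
{
    unsigned m = 0;

    if ((uint32_t)a != (uint32_t)b)
        m |= 1;                    /* low lane differs  */
    if ((uint32_t)(a >> 32) != (uint32_t)(b >> 32))
        m |= 2;                    /* high lane differs */
    return m;
}

int main(void)
{
    /* Only the high halves differ -> expect mask 2. */
    printf("%u\n", diffrich_u64a_sketch(0x00000001ffffffffULL,
                                        0x00000002ffffffffULL));
    return 0;
}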
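
A note on scripts/regex_start.sh: for each pattern file it compiles a serialized database with hscollider and then publishes it through the xdp_rex configfs interface by creating /sys/kernel/config/rex/<name>/ and writing the database, note and id attributes. A hedged C sketch of the publish step only, since configfs attributes are written like ordinary files (the "example" name, the .db path and the buffer size are illustrative; this assumes the module is loaded and runs as root):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

static int write_file(const char *path, const void *buf, size_t len)
{
    int fd = open(path, O_WRONLY);
    ssize_t n;

    if (fd < 0)
        return -1;
    n = write(fd, buf, len);
    close(fd);
    return n == (ssize_t)len ? 0 : -1;
}

int main(void)
{
    const char *dir = "/sys/kernel/config/rex/example";
    static char db[1 << 20];   /* illustrative size limit */
    char path[256];
    ssize_t db_len;
    int fd;

    /* mkdir of the configfs group makes its attributes appear. */
    if (mkdir(dir, 0755) && errno != EEXIST) {
        perror("mkdir");
        return 1;
    }

    /* Read the hscollider output (path is an example). */
    fd = open("/tmp/tempesta/out/example.db", O_RDONLY);
    if (fd < 0 || (db_len = read(fd, db, sizeof(db))) <= 0) {
        perror("read db");
        return 1;
    }
    close(fd);

    snprintf(path, sizeof(path), "%s/database", dir);
    if (write_file(path, db, db_len)) {
        perror("write database");
        return 1;
    }

    snprintf(path, sizeof(path), "%s/id", dir);
    return write_file(path, "example\n", 8) ? 1 : 0;
}

regex_stop.sh tears these groups down again with rmdir, which is why tempesta.sh runs it before rmmod of $rgx_mod and before a live reload recompiles the databases.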