Skip to content

Commit

Permalink
gt: add torus optimization to optimized GT multiexp
Browse files Browse the repository at this point in the history
  • Loading branch information
mratsim committed Nov 3, 2024
1 parent 45bac91 commit 55c5d38
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 43 deletions.
4 changes: 3 additions & 1 deletion benchmarks/bench_gt_multiexp_bls12_381.nim
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ const AvailableCurves = [
BLS12_381,
]

const testNumPoints = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
# const testNumPoints = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
const testNumPoints = [128, 256]


type Fp12over4[C: static Algebra] = CubicExt[Fp4[C]]
type Fp12over6[C: static Algebra] = QuadraticExt[Fp6[C]]
Expand Down
53 changes: 40 additions & 13 deletions benchmarks/bench_gt_parallel_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ proc report(op, domain: string, start, stop: MonoTime, startClk, stopClk: int64,
let ns = inNanoseconds((stop-start) div iters)
let throughput = 1e9 / float64(ns)
when SupportsGetTicks:
echo &"{op:<55} {domain:<20} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
echo &"{op:<65} {domain:<20} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
else:
echo &"{op:<55} {domain:<20} {throughput:>15.3f} ops/s {ns:>9} ns/op"
echo &"{op:<65} {domain:<20} {throughput:>15.3f} ops/s {ns:>9} ns/op"

macro fixFieldDisplay(T: typedesc): untyped =
# At compile-time, enums are integers and their display is buggy
Expand Down Expand Up @@ -126,11 +126,14 @@ proc multiExpParallelBench*[GT](ctx: var BenchMultiExpContext[GT], numInputs: in

var r{.noInit.}: GT
var startNaive, stopNaive, startMultiExpBaseline, stopMultiExpBaseline: MonoTime
var startMultiExpOpt, stopMultiExpOpt, startMultiExpPara, stopMultiExpPara: MonoTime
var startMultiExpOptEndo, stopMultiExpOptEndo, startMultiExpPara, stopMultiExpPara: MonoTime
var startMultiExpOptNoEndo, stopMultiExpOptNoEndo: Monotime

when GT is QuadraticExt:
var startMultiExpBaselineTorus: MonoTime
var stopMultiExpBaselineTorus: MonoTime
var startMultiExpOptTorusNoEndo: MonoTime
var stopMultiExpOptTorusNoEndo: Monotime

if numInputs <= 100000:
# startNaive = getMonotime()
Expand Down Expand Up @@ -166,10 +169,23 @@ proc multiExpParallelBench*[GT](ctx: var BenchMultiExpContext[GT], numInputs: in
stopMultiExpBaselineTorus = getMonotime()

block:
startMultiExpOpt = getMonotime()
bench("𝔾ₜ multi-exp optimized " & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
r.multiExp_vartime(elems, exponents)
stopMultiExpOpt = getMonotime()
startMultiExpOptNoEndo = getMonotime()
bench("𝔾ₜ multi-exp opt no endo " & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
r.multiExp_vartime(elems, exponents, useEndo = false, useTorus = false)
stopMultiExpOptNoEndo = getMonotime()

block:
startMultiExpOptEndo = getMonotime()
bench("𝔾ₜ multi-exp opt + endo " & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
r.multiExp_vartime(elems, exponents, useEndo = true, useTorus = false)
stopMultiExpOptEndo = getMonotime()

when GT is QuadraticExt:
block:
startMultiExpOptTorusNoEndo = getMonotime()
bench("𝔾ₜ multiexp opt+torus no endo" & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
r.multiExp_vartime(elems, exponents, useEndo = false, useTorus = true)
stopMultiExpOptTorusNoEndo = getMonotime()

block:
ctx.tp = Threadpool.new()
Expand All @@ -183,24 +199,35 @@ proc multiExpParallelBench*[GT](ctx: var BenchMultiExpContext[GT], numInputs: in

let perfNaive = inNanoseconds((stopNaive-startNaive) div iters)
let perfMultiExpBaseline = inNanoseconds((stopMultiExpBaseline-startMultiExpBaseline) div iters)
let perfMultiExpOpt = inNanoseconds((stopMultiExpOpt-startMultiExpOpt) div iters)
let perfMultiExpOptNoEndo = inNanoseconds((stopMultiExpOptNoEndo-startMultiExpOptNoEndo) div iters)
let perfMultiExpOptEndo = inNanoseconds((stopMultiExpOptEndo-startMultiExpOptEndo) div iters)
let perfMultiExpPara = inNanoseconds((stopMultiExpPara-startMultiExpPara) div iters)
when GT is QuadraticExt:
  # Torus timings are only recorded when GT is a quadratic extension.
  # Each duration is (stop - start) averaged over `iters`; the operand
  # order matters, a swapped subtraction yields a negative time and a
  # negative speedup ratio further down.
  let perfMultiExpBaselineTorus = inNanoseconds((stopMultiExpBaselineTorus-startMultiExpBaselineTorus) div iters)
  let perfMultiExpOptTorusNoEndo = inNanoseconds((stopMultiExpOptTorusNoEndo-startMultiExpOptTorusNoEndo) div iters)

if numInputs <= 100000:
let speedupBaseline = float(perfNaive) / float(perfMultiExpBaseline)
echo &"Speedup ratio baseline over naive linear combination: {speedupBaseline:>6.3f}x"

let speedupOpt = float(perfNaive) / float(perfMultiExpOpt)
let speedupOpt = float(perfNaive) / float(perfMultiExpOptNoEndo)
echo &"Speedup ratio optimized over naive linear combination: {speedupOpt:>6.3f}x"

let speedupOptBaseline = float(perfMultiExpBaseline) / float(perfMultiExpOpt)
echo &"Speedup ratio optimized over baseline linear combination: {speedupOptBaseline:>6.3f}x"
let speedupOptBaseline = float(perfMultiExpBaseline) / float(perfMultiExpOptNoEndo)
echo &"Speedup ratio optimized no endomorphism over baseline linear combination: {speedupOptBaseline:>6.3f}x"

# Ratio of the naive timing to the optimized multi-exp *with* endomorphism
# acceleration (the no-endomorphism ratio is reported separately above).
let speedupOptEndo = float(perfNaive) / float(perfMultiExpOptEndo)
echo &"Speedup ratio optimized+endomorphism over naive linear combination: {speedupOptEndo:>6.3f}x"

# How much faster the optimized multi-exp gets once endomorphism
# acceleration is enabled (no-endo time divided by endo time).
let speedupOptEndoOpt = float(perfMultiExpOptNoEndo) / float(perfMultiExpOptEndo)
echo &"Speedup ratio optimized without/with endomorphism: {speedupOptEndoOpt:>6.3f}x"

when GT is QuadraticExt:
let speedupTorusOverBaseline = float(perfMultiExpBaseline) / float(perfMultiExpBaselineTorus)
echo &"Speedup ratio baseline+Torus over baseline linear combination: {speedupTorusOverBaseline:>6.3f}x"

let speedupParaOpt = float(perfMultiExpOpt) / float(perfMultiExpPara)
echo &"Speedup ratio parallel over optimized linear combination: {speedupParaOpt:>6.3f}x"
let speedupTorusOverOpt = float(perfMultiExpOptNoEndo) / float(perfMultiExpOptTorusNoEndo)
echo &"Speedup ratio optimized+Torus over optimized: {speedupTorusOverOpt:>6.3f}x"

let speedupParaOpt = float(perfMultiExpOptEndo) / float(perfMultiExpPara)
echo &"Speedup ratio parallel over optimized+endomorphism linear combination: {speedupParaOpt:>6.3f}x"
109 changes: 84 additions & 25 deletions constantine/math/pairings/gt_multiexp.nim
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ func `~*=`(a: var T2Prj, b: T2Prj) {.inline.} =
func `~/=`(a: var T2Prj, b: T2Aff) {.inline.} =
## Cyclotomic division
var t {.noInit.}: T2Aff
t.cyclotomic_inv(b)
t.inv(b)
a ~*= t

# Reference multi-exponentiation
Expand Down Expand Up @@ -212,7 +212,7 @@ func multiExp_reference_vartime*[bits: static int, Gt](
when useTorus:
static: doAssert Gt is QuadraticExt, "GT was: " & $Gt
type F = typeof(elems[0].c0)
let elemsTorus = allocHeapArrayAligned(T2Aff[F], N, alignment = 64)
var elemsTorus = allocHeapArrayAligned(T2Aff[F], N, alignment = 64)
elemsTorus.toOpenArray(0, N-1).batchFromGT_vartime(
elems.toOpenArray(0, N-1)
)
Expand Down Expand Up @@ -266,7 +266,7 @@ func multiExp_reference_vartime*[Gt](
# #
# ########################################################### #

func accumulate[GT](buckets: ptr UncheckedArray[GT], val: SecretWord, negate: SecretBool, elem: GT) {.inline, meter.} =
func accumulate[GtAcc, GtElt](buckets: ptr UncheckedArray[GtAcc], val: SecretWord, negate: SecretBool, elem: GtElt) {.inline, meter.} =
let val = BaseType(val)
if val == 0: # Skip g⁰
return
Expand All @@ -275,7 +275,7 @@ func accumulate[GT](buckets: ptr UncheckedArray[GT], val: SecretWord, negate: Se
else:
buckets[val-1] ~*= elem

func bucketReduce[GT](r: var GT, buckets: ptr UncheckedArray[GT], numBuckets: static int) {.meter.} =
func bucketReduce[GtAcc](r: var GtAcc, buckets: ptr UncheckedArray[GtAcc], numBuckets: static int) {.meter.} =
# We interleave reduction with one-ing the bucket to use instruction-level parallelism

var accumBuckets{.noInit.}: typeof(r)
Expand All @@ -293,11 +293,11 @@ type MiniMultiExpKind* = enum
kFullWindow
kBottomWindow

func bucketAccumReduce[bits: static int, GT](
r: var GT,
buckets: ptr UncheckedArray[GT],
func bucketAccumReduce[bits: static int, GtAcc, GtElt](
r: var GtAcc,
buckets: ptr UncheckedArray[GtAcc],
bitIndex: int, miniMultiExpKind: static MiniMultiExpKind, c: static int,
elems: ptr UncheckedArray[GT], expos: ptr UncheckedArray[BigInt[bits]], N: int) =
elems: ptr UncheckedArray[GtElt], expos: ptr UncheckedArray[BigInt[bits]], N: int) =

const excess = bits mod c
const top = bits - excess
Expand Down Expand Up @@ -326,11 +326,11 @@ func bucketAccumReduce[bits: static int, GT](
# 2. Bucket Reduction
r.bucketReduce(buckets, numBuckets = 1 shl (c-1))

func miniMultiExp[bits: static int, GT](
r: var GT,
buckets: ptr UncheckedArray[GT],
func miniMultiExp[bits: static int, GtAcc, GtElt](
r: var GtAcc,
buckets: ptr UncheckedArray[GtAcc],
bitIndex: int, miniMultiExpKind: static MiniMultiExpKind, c: static int,
elems: ptr UncheckedArray[GT], expos: ptr UncheckedArray[BigInt[bits]], N: int) {.meter.} =
elems: ptr UncheckedArray[GtElt], expos: ptr UncheckedArray[BigInt[bits]], N: int) {.meter.} =
## Apply a mini-Multi-Exponentiation on [bitIndex, bitIndex+window)
## slice of all (coef, point) pairs

Expand All @@ -345,9 +345,9 @@ func miniMultiExp[bits: static int, GT](
for _ in 0 ..< c:
r.cyclotomic_square()

func multiExpImpl_vartime[bits: static int, GT](
r: var GT,
elems: ptr UncheckedArray[GT], expos: ptr UncheckedArray[BigInt[bits]],
func multiExpImpl_vartime[bits: static int, GtAcc, GtElt](
r: var GtAcc,
elems: ptr UncheckedArray[GtElt], expos: ptr UncheckedArray[BigInt[bits]],
N: int, c: static int) {.tags:[VarTime, HeapAlloc], meter.} =
## Multiexponentiation:
## r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
Expand All @@ -356,7 +356,7 @@ func multiExpImpl_vartime[bits: static int, GT](
# -----
const numBuckets = 1 shl (c-1)

let buckets = allocHeapArray(GT, numBuckets)
let buckets = allocHeapArray(GtAcc, numBuckets)
for i in 0 ..< numBuckets:
buckets[i].setNeutral()

Expand Down Expand Up @@ -449,7 +449,7 @@ template withEndo[exponentsBits: static int, GT](
# #
# ########################################################### #

func multiexp_dispatch_vartime[bits: static int, GT](
func multiexp_dispatch_endo_vartime[bits: static int, GT](
r: var GT,
elems: ptr UncheckedArray[GT],
expos: ptr UncheckedArray[BigInt[bits]], N: int) =
Expand All @@ -474,6 +474,43 @@ func multiexp_dispatch_vartime[bits: static int, GT](
of 11: withEndo(multiExpImpl_vartime, r, elems, expos, N, c = 11)
of 12: withEndo(multiExpImpl_vartime, r, elems, expos, N, c = 12)
of 13: withEndo(multiExpImpl_vartime, r, elems, expos, N, c = 13)
of 14: withEndo(multiExpImpl_vartime, r, elems, expos, N, c = 14)
of 15: withEndo(multiExpImpl_vartime, r, elems, expos, N, c = 15)

of 16..17: withEndo(multiExpImpl_vartime, r, elems, expos, N, c = 16)
else:
unreachable()

func multiexp_dispatch_no_endo_vartime[bits: static int, GtAcc, GtElt](
r: var GtAcc,
elems: ptr UncheckedArray[GtElt],
expos: ptr UncheckedArray[BigInt[bits]], N: int) =
## Multiexponentiation:
## r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
##
## TODO: Currently endomorphism acceleration on a torus is not implemented,
## we have 2 approaches:
## - First convert to Torus then apply endomorphism acceleration
## - or apply endomorphism acceleration then convert to Torus
let c = bestBucketBitSize(N, bits, useSignedBuckets = true, useManualTuning = true)

# No endomorphism is applied on this path, so `bits` and `N` are used
# unchanged to select `c`.
# TODO: benchmark

case c
of 2: multiExpImpl_vartime(r, elems, expos, N, c = 2)
of 3: multiExpImpl_vartime(r, elems, expos, N, c = 3)
of 4: multiExpImpl_vartime(r, elems, expos, N, c = 4)
of 5: multiExpImpl_vartime(r, elems, expos, N, c = 5)
of 6: multiExpImpl_vartime(r, elems, expos, N, c = 6)
of 7: multiExpImpl_vartime(r, elems, expos, N, c = 7)
of 8: multiExpImpl_vartime(r, elems, expos, N, c = 8)
of 9: multiExpImpl_vartime(r, elems, expos, N, c = 9)
of 10: multiExpImpl_vartime(r, elems, expos, N, c = 10)
of 11: multiExpImpl_vartime(r, elems, expos, N, c = 11)
of 12: multiExpImpl_vartime(r, elems, expos, N, c = 12)
of 13: multiExpImpl_vartime(r, elems, expos, N, c = 13)
of 14: multiExpImpl_vartime(r, elems, expos, N, c = 14)
of 15: multiExpImpl_vartime(r, elems, expos, N, c = 15)

Expand All @@ -485,41 +522,63 @@ func multiExp_vartime*[bits: static int, GT](
r: var GT,
elems: ptr UncheckedArray[GT],
expos: ptr UncheckedArray[BigInt[bits]],
len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter, inline.} =
len: int,
useEndo: static bool = true,
useTorus: static bool = false) {.tags:[VarTime, Alloca, HeapAlloc], meter, inline.} =
## Multiexponentiation:
## r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
multiExp_dispatch_vartime(r, elems, expos, len)
when useTorus:
  # Torus path: convert every input to its T2Aff form, run the bucket
  # multi-exponentiation on torus elements, then convert the result back.
  # `useEndo` is not consulted on this branch; the no-endomorphism
  # dispatcher is always used.
  static: doAssert Gt is QuadraticExt, "GT was: " & $Gt
  type F = typeof(elems[0].c0)
  # Heap scratch buffer for the converted inputs (`len` elements).
  var elemsTorus = allocHeapArrayAligned(T2Aff[F], len, alignment = 64)
  elemsTorus.toOpenArray(0, len-1).batchFromGT_vartime(
    elems.toOpenArray(0, len-1)
  )
  var r_torus {.noInit.}: T2Prj[F]
  multiexp_dispatch_no_endo_vartime(r_torus, elemsTorus, expos, len)
  r.fromTorus2_vartime(r_torus)
  # Release the scratch buffer; without this every call leaks it.
  freeHeapAligned(elemsTorus)
else:
  # Plain GT path: pick the dispatcher according to `useEndo`.
  when useEndo:
    multiExp_dispatch_endo_vartime(r, elems, expos, len)
  else:
    multiExp_dispatch_no_endo_vartime(r, elems, expos, len)

func multiExp_vartime*[bits: static int, GT](
r: var GT,
elems: openArray[GT],
expos: openArray[BigInt[bits]]) {.tags:[VarTime, Alloca, HeapAlloc], meter, inline.} =
expos: openArray[BigInt[bits]],
useEndo: static bool = true,
useTorus: static bool = false) {.tags:[VarTime, Alloca, HeapAlloc], meter, inline.} =
## Multiexponentiation:
## r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
debug: doAssert elems.len == expos.len
let N = elems.len
multiExp_dispatch_vartime(r, elems.asUnchecked(), expos.asUnchecked(), N)
multiExp_vartime(r, elems.asUnchecked(), expos.asUnchecked(), N, useEndo, useTorus)

func multiExp_vartime*[F, GT](
r: var GT,
elems: ptr UncheckedArray[GT],
expos: ptr UncheckedArray[F],
len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
len: int,
useEndo: static bool = true,
useTorus: static bool = false) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
## Multiexponentiation:
## r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
let n = cast[int](len)
let expos_big = allocHeapArrayAligned(F.getBigInt(), n, alignment = 64)
expos_big.batchFromField(expos, n)
r.multiExp_vartime(elems, expos_big, n)
r.multiExp_vartime(elems, expos_big, n, useEndo, useTorus)

freeHeapAligned(expos_big)

func multiExp_vartime*[GT](
r: var GT,
elems: openArray[GT],
expos: openArray[Fr]) {.tags:[VarTime, Alloca, HeapAlloc], inline.} =
expos: openArray[Fr],
useEndo: static bool = true,
useTorus: static bool = true) {.tags:[VarTime, Alloca, HeapAlloc], inline.} =
## Multiexponentiation:
## r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
debug: doAssert elems.len == expos.len
let N = elems.len
multiExp_vartime(r, elems.asUnchecked(), expos.asUnchecked(), N)
multiExp_vartime(r, elems.asUnchecked(), expos.asUnchecked(), N, useEndo, useTorus)
4 changes: 2 additions & 2 deletions constantine/math/pairings/gt_prj.nim
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ proc fromTorus2_vartime*[F](r: var QuadraticExt[F], a: T2Aff[F]) =

# Special case identity element
if bool a.isNeutral():
r.setOne()
r.setNeutral()
return

var num {.noInit.}, den {.noInit.}: typeof(r)
Expand Down Expand Up @@ -380,7 +380,7 @@ proc mixedProd_vartime*[F](r: var T2Prj[F], a: T2Prj[F], b: T2Aff[F]) =
r = a
return

var u0 {.noInit.}, u1 {.noInit.}, t{.noInit.}: F
var u0 {.noInit.}, u1 {.noInit.}, t {.noInit.}: F
u0.prod(a.x, F b)
u1.prod(a.z, F b)
t.prod(a.z, NonResidue)
Expand Down
6 changes: 4 additions & 2 deletions tests/math_pairings/t_pairing_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -241,14 +241,16 @@ proc runGTmultiexpTests*[N: static int](GT: typedesc, num_points: array[N, int],
t.gtExp_vartime(elems[i], exponents[i])
naive *= t

var mexp_ref, mexp_ref_torus, mexp_opt: GT
var mexp_ref, mexp_ref_torus, mexp_opt, mexp_opt_torus: GT
mexp_ref.multiExp_reference_vartime(elems, exponents, useTorus = false)
mexp_ref_torus.multiExp_reference_vartime(elems, exponents, useTorus = true)
mexp_opt.multiExp_vartime(elems, exponents)
mexp_opt.multiExp_vartime(elems, exponents, useEndo = true, useTorus = false)
mexp_opt_torus.multiExp_vartime(elems, exponents, useEndo = false, useTorus = true)

doAssert bool(naive == mexp_ref)
doAssert bool(naive == mexp_ref_torus)
doAssert bool(naive == mexp_opt)
doAssert bool(naive == mexp_opt_torus)

stdout.write '.'

Expand Down

0 comments on commit 55c5d38

Please sign in to comment.