kimchi/circuits/polynomials/endosclmul.rs
1//! This module implements the **EndoMul** gate for short Weierstrass curve
2//! endomorphism-optimized variable base scalar multiplication.
3//!
4//! # Purpose
5//!
6//! Compute `[scalar] * base_point` where the scalar is given as bits and the
7//! base point is a curve point. This is the core operation for EC-based
8//! cryptography in-circuit.
9//!
10//! # Notation
11//!
12//! - `T`: The fixed base point for the full scalar multiplication
13//! - `P`: The running accumulator point (changes row by row)
14//!
15//! # Inputs (per row)
16//!
17//! - `(x_T, y_T)`: Base point T being multiplied (columns 0, 1)
18//! - `(x_P, y_P)`: Current accumulator point P (columns 4, 5)
19//! - `n`: Current accumulated scalar value (column 6)
20//! - `b1, b2, b3, b4`: Four scalar bits for this row (columns 11-14)
21//!
22//! # Outputs (in next row)
23//!
24//! - `(x_S, y_S)`: Updated accumulator point after processing 4 bits (cols 4,5)
25//! - `n'`: Updated accumulated scalar: `n' = 16*n + 8*b1 + 4*b2 + 2*b3 + b4`
26//!
27//! # Endomorphism-optimized scalar multiplication
28//!
29//! For curves of the form y^2 = x^3 + b (like Pallas and Vesta), there exists
30//! an efficient endomorphism phi defined by:
31//!
32//! phi(x, y) = (endo * x, y)
33//!
34//! where `endo` (also called xi) is a primitive cube root of unity in the base
35//! field. This works because (endo * x)^3 = endo^3 * x^3 = x^3, so the point
36//! remains on the curve.
37//!
38//! This endomorphism corresponds to scalar multiplication by lambda:
39//!
40//! phi(T) = [lambda]T
41//!
42//! where lambda is a primitive cube root of unity in the scalar field.
43//!
44//! ## How the optimization works
45//!
46//! The key insight is that we can compute `P + phi(T)` or `P - phi(T)` almost
47//! as cheaply as `P + T` or `P - T`, because applying phi only requires
48//! multiplying the x-coordinate by `endo`.
49//!
50//! For a 2-bit window (b1, b2), we can encode 4 different point operations:
51//!
52//! | b1 | b2 | Point Q added to accumulator P |
53//! |----|----|--------------------|
54//! | 0 | 0 | -T |
55//! | 0 | 1 | T |
56//! | 1 | 0 | -phi(T) |
57//! | 1 | 1 | phi(T) |
58//!
59//! This is achieved by:
60//! - `xq = (1 + (endo - 1) * b1) * x_T` = x_T if b1=0, or endo * x_T if b1=1
61//! - `yq = (2 * b2 - 1) * y_T` = -y_T if b2=0, or y_T if b2=1
62//!
63//! So (xq, yq) represents one of {T, -T, phi(T), -phi(T)} based on (b1, b2).
64//!
65//! ## Why phi(T)? The GLV optimization
66//!
67//! When we want to compute `[k]T` for a large scalar `k`, a standard
68//! variable-base method uses roughly one double-and-add update per scalar bit
69//! (~256 updates for a 256-bit scalar). The GLV method
70//! (Gallant-Lambert-Vanstone) cuts this roughly in half.
71//!
72//! The key insight is that any scalar k can be decomposed as:
73//!
74//! k = k1 + k2 * lambda (mod r)
75//!
76//! where k1, k2 are roughly half the bit-length of k (about 128 bits each).
77//! Since `phi(T) = [lambda]T`, we can rewrite:
78//!
79//! [k]T = [k1]T + [k2][lambda]T = [k1]T + [k2]phi(T)
80//!
81//! Now instead of one 256-bit scalar multiplication, we have two 128-bit scalar
82//! multiplications: `[k1]T` and `[k2]phi(T)`. But we can do even better by
83//! computing both **simultaneously** using a multi-scalar multiplication
84//! approach.
85//!
86//! In each step, we process one bit from k1 and one bit from k2 together. The
87//! 2-bit encoding (b1, b2) selects which combination of T and phi(T) to add:
88//!
89//! - b1 selects between T (b1=0) and phi(T) (b1=1)
90//! - b2 selects the sign: negative (b2=0) or positive (b2=1)
91//!
92//! | b1 | b2 | Point added |
93//! |----|----|-------------|
94//! | 0 | 0 | -T |
95//! | 0 | 1 | T |
96//! | 1 | 0 | -phi(T) |
97//! | 1 | 1 | phi(T) |
98//!
99//! The negative points come from the y-coordinate formula: `yq = (2*b2 - 1)*y_T`.
100//! When b2=0, we get `-y_T`, which negates the point (since `-P = (x, -y)` on
101//! elliptic curves). We need both positive and negative points to encode the
102//! scalar using a **signed digit representation**. With 2 bits we represent 4
103//! distinct values `{-1, +1} x {T, phi(T)}`, which is more expressive than just
104//! `{0, 1} x {T, phi(T)}`. This signed representation is part of what makes the
105//! GLV method efficient - it allows the scalar decomposition to use both
106//! positive and negative contributions.
107//!
108//! This interleaves the bits of k1 and k2, processing one bit of each per
109//! accumulator update. Since k1 and k2 are ~128 bits, we need only ~128 updates
110//! instead of ~256, halving the circuit size.
111//!
112//! The gate processes 4 bits per row (two consecutive accumulator updates),
113//! so a 128-bit scalar requires 32 rows of EndoMul gates.
114//!
115//! ## Protocol fit
116//!
117//! In Kimchi/Snarky terminology, this gate enforces the **EC_endoscale**
118//! point-update constraint (the point side of endomorphism-optimized scaling
119//! rounds). In Pickles terms, this corresponds to endomorphism-optimized
120//! point updates used with scalar challenges.
121//!
122//! This gate is used to implement recursive verifier IPA/bulletproof
123//! point-folding logic efficiently (GLV endomorphism optimization), including
124//! repeated accumulator updates of the form `A <- (A + Q) + A`.
125//!
126//! Typical protocol usage:
127//!
128//! - **Wrap proofs**: used as part of normal wrap-circuit verification of step
129//! proofs (part of the wrap recursion flow).
130//! - **Step proofs (recursive setting)**: used when the step circuit verifies
131//! previous proofs (e.g. `max_proofs_verified > 0`).
132//! - **Non-recursive step circuits**: not inherently required; if no recursive
133//! verification gadget is instantiated, this gate need not be active.
134//!
135//! ## Usage
136//!
137//! To compute `[scalar] * base` for a 128-bit scalar:
138//!
139//! 1. **Set up gates**: Create 32 consecutive EndoMul gates (128 bits / 4 bits
140//! per row), followed by one Zero gate. The Zero gate is required because
141//! each EndoMul gate reads the accumulator from the next row.
142//!
143//! 2. **Compute initial accumulator**: To avoid point-at-infinity edge cases,
144//! initialize the accumulator as `acc0 = 2 * (T + phi(T))` where T is the
145//! base point and phi is the endomorphism.
146//!
147//! 3. **Prepare scalar bits**: Convert the scalar to bits in **MSB-first**
148//! order (most significant bit at index 0).
149//!
150//! 4. **Generate witness**: Call `gen_witness` with the witness array, starting
151//! row, endo coefficient, base point coordinates, MSB-first bits, and
152//! initial accumulator. The function returns the final accumulated point
153//! and the reconstructed scalar value.
154//!
155//! See `kimchi/src/tests/endomul.rs` for a complete example.
156//!
157//! ## Invariants
158//!
159//! The following invariants **must** be respected:
160//!
161//! 1. **Bit count**: `bits.len()` must be a multiple of 4.
162//!
163//! 2. **Bit order**: Bits must be in **MSB-first** order (most significant bit
164//! at index 0).
165//!
166//! 3. **Gate chain**: For `n` bits, you need `n/4` consecutive EndoMul gates,
167//! followed by a Zero gate (or any gate that doesn't constrain the EndoMul
168//! output columns). The Zero gate is needed because EndoMul reads from the
169//! next row.
170//!
171//! 4. **Initial accumulator**: `acc0` must not be the point at infinity. The
172//! standard initialization is `acc0 = 2 * (T + phi(T))` where T is the base
173//! point. This ensures the accumulator never hits the point at infinity
174//! during computation.
175//!
176//! 5. **Endo coefficient**: The `endo` parameter must be the correct cube root
177//! of unity for the curve, obtained via `endos::<Curve>()`.
178//!
179//! 6. **Base point consistency**: The base point `(x_T, y_T)` must be the same
180//! across all rows of a single scalar multiplication.
181//!
182//! 7. **Scalar value verification**: The EndoMul gate only constrains the
183//! row-to-row relationship `n' = 16*n + 8*b1 + 4*b2 + 2*b3 + b4`. It does
184//! **not** constrain the initial or final value of `n`. The calling circuit
185//! must add external constraints.
186//! To enforce:
187//! - Initial `n = 0` at the first EndoMul row
188//! - Final `n = k` where `k` is the expected scalar value
189//!
190//! ## References
191//!
192//! - Halo paper, Section 6.2: <https://eprint.iacr.org/2019/1021>
193//! - GLV method: <https://www.iacr.org/archive/crypto2001/21390189.pdf>
194
195use crate::{
196 circuits::{
197 argument::{Argument, ArgumentEnv, ArgumentType},
198 berkeley_columns::{BerkeleyChallengeTerm, BerkeleyChallenges},
199 constraints::ConstraintSystem,
200 expr::{
201 self,
202 constraints::{boolean, ExprOps},
203 Cache,
204 },
205 gate::{CircuitGate, GateType},
206 wires::{GateWires, COLUMNS},
207 },
208 curve::KimchiCurve,
209 proof::{PointEvaluations, ProofEvaluations},
210};
211use ark_ff::{Field, PrimeField};
212use core::marker::PhantomData;
213
214//~ We implement custom gate constraints for short Weierstrass curve
215//~ endomorphism optimized variable base scalar multiplication.
216//~
217//~ Given a finite field $\mathbb{F}_{q}$ of order $q$, if the order is not a
218//~ multiple of 2 nor 3, then an
219//~ elliptic curve over $\mathbb{F}_{q}$ in short Weierstrass form is
220//~ represented by the set of points $(x,y)$ that satisfy the following
221//~ equation with $a,b\in\mathbb{F}_{q}$ and $4a^3+27b^2\neq_{\mathbb{F}_q} 0$:
222//~ $$E(\mathbb{F}_q): y^2 = x^3 + a x + b$$
223//~ If $P=(x_p, y_p)$ and $T=(x_t, y_t)$ are two points in the curve
224//~ $E(\mathbb{F}_q)$, the goal of this operation is to compute
225//~ $S = (P + Q) + P$ where $Q \in \{T, -T, \phi(T), -\phi(T)\}$ is determined
226//~ by bits $(b_1, b_2)$. Here $\phi$ is the curve endomorphism
227//~ $\phi(x,y) = (\mathtt{endo} \cdot x, y)$.
228//~
229//~ The bits encode the point $Q$ as follows:
230//~ * $b_1 = 0$: use $T$, i.e., $x_q = x_t$
231//~ * $b_1 = 1$: use $\phi(T)$, i.e., $x_q = \mathtt{endo} \cdot x_t$
232//~ * $b_2 = 0$: negate, i.e., $y_q = -y_t$
233//~ * $b_2 = 1$: keep sign, i.e., $y_q = y_t$
234//~
235//~ This technique allows processing 2 bits of the scalar per point operation.
236//~ Since each row performs two such operations (using bits $b_1, b_2$ and then
237//~ $b_3, b_4$), we process 4 bits per row.
238//~
239//~ In particular, the constraints of this gate take care of 4 bits of the
240//~ scalar within a single EVBSM row. When the scalar is longer (which will
241//~ usually be the case), multiple EVBSM rows will be concatenated.
242//~
243//~ | Row | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | Type |
244//~ |-----|----|----|------|---|----|----|----|-----|-----|-----|-----|-----|-----|-----|-----|-------|
245//~ | i | xT | yT | inv | Ø | xP | yP | n | xR | yR | s1 | s3 | b1 | b2 | b3 | b4 | EVBSM |
246//~ | i+1 | = | = | inv' | | xS | yS | n' | xR' | yR' | s1' | s3' | b1' | b2' | b3' | b4' | EVBSM |
247//~
248//~ The gate performs two accumulator updates per row, each of the form
249//~ `A <- (A + Q) + A = 2A + Q`.
250//~
251//~ - First, bits `(b1, b2)` select `Q1` in `{T, -T, \phi(T), -\phi(T)}`, and the
252//~ stored point `R = (xR, yR)` is the output of the first update: `R = (P + Q1) + P`.
253//~ - Second, bits `(b3, b4)` select `Q2` in the same set, and the stored point
254//~ `S = (xS, yS)` is the output of the second update: `S = (R + Q2) + R`.
255//~
256//~ The intermediate sums `P + Q1` and `R + Q2` are not stored as witness
257//~ columns. On the next row, `(xS, yS)` becomes the new `(xP, yP)`, and
258//~ `(xR', yR')` is the next row's first-update output.
259//~
260//~ The variables (`xT`, `yT`), (`xP`, `yP`), (`xR`, `yR`), and (`xS`, `yS`)
261//~ are the corresponding affine coordinates of points `T`, `P`, `R`, and `S`.
262//~
263//~ `n` and `n'` are accumulated scalar prefixes in MSB-first order, where `n'`
264//~ extends `n` with the next 4-bit chunk encoded by `b1..b4` with `n ≤ n'``.
265//~ `s1` and `s3` are intermediary values used to compute the slopes from the
266//~ curve addition formula.
267//~
268//~ The layout of this gate (and the next row) allows for this chained behavior where the output point
269//~ of the current row $S$ gets accumulated as one of the inputs of the following row, becoming $P$ in
270//~ the next constraints. Similarly, the scalar is decomposed into binary form and $n$ ($n'$ respectively)
271//~ will store the current accumulated value and the next one for the check.
272//~
273//~ For readability, we define the following variables for the constraints:
274//~
275//~ * `endo` $:=$ `EndoCoefficient`
276//~ * `xq1` $:= (1 + ($`endo`$ - 1)\cdot b_1) \cdot x_t$
277//~ * `xq2` $:= (1 + ($`endo`$ - 1)\cdot b_3) \cdot x_t$
278//~ * `yq1` $:= (2\cdot b_2 - 1) \cdot y_t$
279//~ * `yq2` $:= (2\cdot b_4 - 1) \cdot y_t$
280//~
281//~ Note: each row is performing two additions, so we use two selected points:
282//~ `Q1 := (xq1, yq1)` from bits `(b1, b2)` and `Q2 := (xq2, yq2)` from bits
283//~ `(b3, b4)`. They are points, and each is selected from
284//~ `Q:={T, -T, \phi(T), -\phi(T)}` by its corresponding bit pair. That means:
285//~
286//~ Selection table for the first selected point `Q1`:
287//~
288//~ | b1 | b2 | Q1 | (xq1, yq1) |
289//~ |----|----|----------|--------------------------|
290//~ | 0 | 0 | -T | (x_t, -y_t) |
291//~ | 0 | 1 | T | (x_t, y_t) |
292//~ | 1 | 0 | -\phi(T) | (`endo` \cdot x_t, -y_t) |
293//~ | 1 | 1 | \phi(T) | (`endo` \cdot x_t, y_t) |
294//~
295//~ Selection table for the second selected point `Q2`:
296//~
297//~ | b3 | b4 | Q2 | (xq2, yq2) |
298//~ |----|----|----------|--------------------------|
299//~ | 0 | 0 | -T | (x_t, -y_t) |
300//~ | 0 | 1 | T | (x_t, y_t) |
301//~ | 1 | 0 | -\phi(T) | (`endo` \cdot x_t, -y_t) |
302//~ | 1 | 1 | \phi(T) | (`endo` \cdot x_t, y_t) |
303//~
304//~ These are the 12 constraints that correspond to each EVBSM gate,
305//~ which take care of 4 bits of the scalar within a single EVBSM row:
306//~
307//~ * First block:
308//~ * `(xq1 - xp) * s1 = yq1 - yp`
309//~ * `(2*xp - s1^2 + xq1) * ((xp - xr)*s1 + yr + yp) = (xp - xr) * 2*yp`
310//~ * `(yr + yp)^2 = (xp – xr)^2 * (s1^2 – xq1 + xr)`
311//~ * Second block:
312//~ * `(xq2 - xr) * s3 = yq2 - yr`
313//~ * `(2*xr - s3^2 + xq2) * ((xr - xs)*s3 + ys + yr) = (xr - xs) * 2*yr`
314//~ * `(ys + yr)^2 = (xr – xs)^2 * (s3^2 – xq2 + xs)`
315//~ * Booleanity checks:
316//~ * Bit flag $b_1$: `0 = b1 * (b1 - 1)`
317//~ * Bit flag $b_2$: `0 = b2 * (b2 - 1)`
318//~ * Bit flag $b_3$: `0 = b3 * (b3 - 1)`
319//~ * Bit flag $b_4$: `0 = b4 * (b4 - 1)`
320//~ * Binary decomposition:
321//~ * Accumulated scalar: `n' = 16 * n + 8 * b1 + 4 * b2 + 2 * b3 + b4`
322//~ * Distinct point checks:
323//~ * `(xp - xr) * (xr - xs) * inv = 1`
324//~ - Note: if `xp = xr` (equiv `xr = xs`) then we see `(yr + yp)^2 = 0`
325//~ from constraint 3, and so we are necessarily in the disallowed
326//~ degenerate case `P=-R` (`xp = xr` and `yr = -yp`).
327//~
328//~ Note: in the EC derivation below, `R` and `S` are local symbols inside each
329//~ block's addition formulas. The witness columns still follow the row layout
330//~ above (`xP, yP` as input, `xR, yR` after the first update, `xS, yS` after
331//~ the second update).
332//~
333//~ The constraints above are derived from the following EC Affine arithmetic
334//~ equations.
335//~
336//~ **Background on EC point addition/doubling:**
337//~
338//~ For points P = (x_p, y_p) and Q = (x_q, y_q) on a short Weierstrass curve,
339//~ the sum R = P + Q = (x_r, y_r) is computed as:
340//~
341//~ * Slope: $s = (y_q - y_p) / (x_q - x_p)$
342//~ * $x_r = s^2 - x_p - x_q$
343//~ * $y_r = s \cdot (x_p - x_r) - y_p$
344//~
345//~ For point doubling R = 2P:
346//~
347//~ * Slope: $s = (3 x_p^2 + a) / (2 y_p)$ (where a=0 for our curves)
348//~ * $x_r = s^2 - 2 \cdot x_p$
349//~ * $y_r = s \cdot (x_p - x_r) - y_p$
350//~
351//~ **Derivation of the constraints:**
352//~
353//~ Each "block" computes S = (P + Q) + P where Q = (xq, yq) is determined by
354//~ bits. The intermediate point R = P + Q and final point S = R + P.
355//~
356//~ We use two slopes:
357//~ * $s_1$: slope for P + Q -> R
358//~ * $s_2$: slope for R + P -> S
359//~
360//~ The key optimization is eliminating $s_2$ from the constraints by
361//~ substituting:
362//~
363//~ * (1) => $(x_{q_1} - x_p) \cdot s_1 = y_{q_1} - y_p$
364//~ * (2&3) => $(x_p – x_r) \cdot s_2 = y_r + y_p$
365//~ * (2) => $(2 \cdot x_p + x_{q_1} – s_1^2) \cdot (s_1 + s_2) = 2 \cdot y_p$
366//~ * <=> $(2 x_p - s_1^2 + x_{q_1})((x_p - x_r) s_1 + y_r + y_p)$
367//~ $= (x_p - x_r) \cdot 2 y_p$
368//~ * (3) => $s_1^2 - s_2^2 = x_{q_1} - x_r$
369//~ * <=> $(y_r + y_p)^2 = (x_p – x_r)^2 \cdot (s_1^2 – x_{q_1} + x_r)$
370//~ *
371//~ * (4) => $(x_{q_2} - x_r) \cdot s_3 = y_{q_2} - y_r$
372//~ * (5&6) => $(x_r – x_s) \cdot s_4 = y_s + y_r$
373//~ * (5) => $(2 \cdot x_r + x_{q_2} – s_3^2) \cdot (s_3 + s_4) = 2 \cdot y_r$
374//~ * <=> $(2 x_r - s_3^2 + x_{q_2})((x_r - x_s) s_3 + y_s + y_r)$
375//~ $= (x_r - x_s) \cdot 2 y_r$
376//~ * (6) => $s_3^2 – s_4^2 = x_{q_2} - x_s$
377//~ * <=> $(y_s + y_r)^2 = (x_r – x_s)^2 \cdot (s_3^2 – x_{q_2} + x_s)$
378//~
379//~ Defining $s_2$ and $s_4$ as
380//~
381//~ * $s_2 := \frac{2 \cdot y_P}{2 * x_P + x_{q_1} - s_1^2} - s_1$
382//~ * $s_4 := \frac{2 \cdot y_R}{2 * x_R + x_{q_2} - s_3^2} - s_3$
383//~
384//~ Gives the following equations when substituting $s_2$ and $s_4$:
385//~
386//~ 1. `(xq1 - xp) * s1 = yq1 - yp` (i.e., `(2 * b2 - 1) * yt - yp`)
387//~ 2. `(2*xp - s1^2 + xq1) * ((xp - xr)*s1 + yr + yp) = (xp - xr) * 2*yp`
388//~ 3. `(yr + yp)^2 = (xp – xr)^2 * (s1^2 – xq1 + xr)`
389//~
390//~ 4. `(xq2 - xr) * s3 = yq2 - yr` (i.e., `(2 * b4 - 1) * yt - yr`)
391//~ 5. `(2*xr - s3^2 + xq2) * ((xr - xs)*s3 + ys + yr) = (xr - xs) * 2*yr`
392//~ 6. `(ys + yr)^2 = (xr – xs)^2 * (s3^2 – xq2 + xs)`
393//~
394
395/// Implementation of group endomorphism optimized
396/// variable base scalar multiplication custom Plonk constraints.
397impl<F: PrimeField> CircuitGate<F> {
398 pub fn create_endomul(wires: GateWires) -> Self {
399 CircuitGate::new(GateType::EndoMul, wires, vec![])
400 }
401
402 /// Verify the `EndoMul` gate.
403 ///
404 /// # Errors
405 ///
406 /// Will give error if `self.typ` is not `GateType::EndoMul`, or if
407 /// constraint evaluation fails.
408 pub fn verify_endomul<
409 const FULL_ROUNDS: usize,
410 G: KimchiCurve<FULL_ROUNDS, ScalarField = F>,
411 >(
412 &self,
413 row: usize,
414 witness: &[Vec<F>; COLUMNS],
415 cs: &ConstraintSystem<F>,
416 ) -> Result<(), String> {
417 ensure_eq!(self.typ, GateType::EndoMul, "incorrect gate type");
418
419 let this: [F; COLUMNS] = core::array::from_fn(|i| witness[i][row]);
420 let next: [F; COLUMNS] = core::array::from_fn(|i| witness[i][row + 1]);
421
422 let pt = F::from(123456u64);
423
424 let constants = expr::Constants {
425 mds: &G::sponge_params().mds,
426 endo_coefficient: cs.endo,
427 zk_rows: cs.zk_rows,
428 };
429 let challenges = BerkeleyChallenges {
430 alpha: F::zero(),
431 beta: F::zero(),
432 gamma: F::zero(),
433 joint_combiner: F::zero(),
434 };
435
436 let evals: ProofEvaluations<PointEvaluations<G::ScalarField>> =
437 ProofEvaluations::dummy_with_witness_evaluations(this, next);
438
439 let constraints = EndosclMul::constraints(&mut Cache::default());
440 for (i, c) in constraints.iter().enumerate() {
441 match c.evaluate_(cs.domain.d1, pt, &evals, &constants, &challenges) {
442 Ok(x) => {
443 if x != F::zero() {
444 return Err(format!("Bad endo equation {i}"));
445 }
446 }
447 Err(e) => return Err(format!("evaluation failed: {e}")),
448 }
449 }
450
451 Ok(())
452 }
453
454 pub fn endomul(&self) -> F {
455 if self.typ == GateType::EndoMul {
456 F::one()
457 } else {
458 F::zero()
459 }
460 }
461}
462
463/// Implementation of the `EndosclMul` gate.
464#[derive(Default)]
465pub struct EndosclMul<F>(PhantomData<F>);
466
467impl<F> Argument<F> for EndosclMul<F>
468where
469 F: PrimeField,
470{
471 const ARGUMENT_TYPE: ArgumentType = ArgumentType::Gate(GateType::EndoMul);
472 const CONSTRAINTS: u32 = 12;
473
474 fn constraint_checks<T: ExprOps<F, BerkeleyChallengeTerm>>(
475 env: &ArgumentEnv<F, T>,
476 cache: &mut Cache,
477 ) -> Vec<T> {
478 let b1 = env.witness_curr(11);
479 let b2 = env.witness_curr(12);
480 let b3 = env.witness_curr(13);
481 let b4 = env.witness_curr(14);
482
483 let xt = env.witness_curr(0);
484 let yt = env.witness_curr(1);
485
486 let inv = env.witness_curr(2);
487
488 let xs = env.witness_next(4);
489 let ys = env.witness_next(5);
490
491 let xp = env.witness_curr(4);
492 let yp = env.witness_curr(5);
493
494 let xr = env.witness_curr(7);
495 let yr = env.witness_curr(8);
496
497 let s1 = env.witness_curr(9);
498 let s3 = env.witness_curr(10);
499
500 let endo_minus_1 = env.endo_coefficient() - T::one();
501 let xq1 = cache.cache((T::one() + b1.clone() * endo_minus_1.clone()) * xt.clone());
502 let xq2 = cache.cache((T::one() + b3.clone() * endo_minus_1) * xt);
503
504 let yq1 = (b2.double() - T::one()) * yt.clone();
505 let yq2 = (b4.double() - T::one()) * yt;
506
507 let s1_squared = cache.cache(s1.square());
508 let s3_squared = cache.cache(s3.square());
509
510 // n_next = 16*n + 8*b1 + 4*b2 + 2*b3 + b4
511 let n = env.witness_curr(6);
512 let n_next = env.witness_next(6);
513 let n_constraint =
514 (((n.double() + b1.clone()).double() + b2.clone()).double() + b3.clone()).double()
515 + b4.clone()
516 - n_next;
517
518 let xp_xr = cache.cache(xp.clone() - xr.clone());
519 let xr_xs = cache.cache(xr.clone() - xs.clone());
520
521 let ys_yr = cache.cache(ys + yr.clone());
522 let yr_yp = cache.cache(yr.clone() + yp.clone());
523
524 vec![
525 // verify booleanity of the scalar bits
526 boolean(&b1),
527 boolean(&b2),
528 boolean(&b3),
529 boolean(&b4),
530 // (xq1 - xp) * s1 = yq1 - yp
531 ((xq1.clone() - xp.clone()) * s1.clone()) - (yq1 - yp.clone()),
532 // (2*xp – s1^2 + xq1) * ((xp - xr) * s1 + yr + yp) = (xp - xr) * 2*yp
533 (((xp.double() - s1_squared.clone()) + xq1.clone())
534 * ((xp_xr.clone() * s1) + yr_yp.clone()))
535 - (yp.double() * xp_xr.clone()),
536 // (yr + yp)^2 = (xp – xr)^2 * (s1^2 – xq1 + xr)
537 yr_yp.square() - (xp_xr.clone().square() * ((s1_squared - xq1) + xr.clone())),
538 // (xq2 - xr) * s3 = yq2 - yr
539 ((xq2.clone() - xr.clone()) * s3.clone()) - (yq2 - yr.clone()),
540 // (2*xr – s3^2 + xq2) * ((xr – xs) * s3 + ys + yr) = (xr - xs) * 2*yr
541 (((xr.double() - s3_squared.clone()) + xq2.clone())
542 * ((xr_xs.clone() * s3) + ys_yr.clone()))
543 - (yr.double() * xr_xs.clone()),
544 // (ys + yr)^2 = (xr – xs)^2 * (s3^2 – xq2 + xs)
545 ys_yr.square() - (xr_xs.clone().square() * ((s3_squared - xq2) + xs)),
546 n_constraint,
547 // (xp - xr) * (xr - xs) * inv = 1
548 xp_xr * xr_xs * inv - T::one(),
549 ]
550 }
551}
552
553/// The result of performing an endomorphism-optimized scalar multiplication.
554///
555/// After processing all scalar bits through the EndoMul gates, this struct
556/// holds:
557/// - The final accumulated curve point (as affine coordinates)
558/// - The reconstructed scalar value from the processed bits
559pub struct EndoMulResult<F> {
560 /// The final accumulated point (x, y) after all scalar multiplication
561 /// steps.
562 /// This equals `[scalar]T` where `T` is the base point and `scalar` is
563 /// derived from the input bits combined with the endomorphism.
564 pub acc: (F, F),
565 /// The accumulated scalar value reconstructed from all processed bits.
566 /// For a 128-bit scalar processed in 32 rows (4 bits/row), this equals
567 /// the original scalar k such that `acc = [k]T` (with endomorphism
568 /// applied).
569 pub n: F,
570}
571
572/// Generates the witness values for a series of EndoMul gates.
573///
574/// This function computes the witness for endomorphism-optimized scalar
575/// multiplication. It processes 4 bits of the scalar per row, computing
576/// the intermediate curve points and slopes needed for the constraints.
577///
578/// # Arguments
579///
580/// * `w` - The witness array to populate (15 columns x num_rows)
581/// * `row0` - The starting row index
582/// * `endo` - The endomorphism coefficient (cube root of unity in base field)
583/// * `base` - The base point T = (x_T, y_T) being multiplied
584/// * `bits` - Scalar bits in MSB-first order. Length must be a multiple of 4.
585/// * `acc0` - Initial accumulator point. Typically set to `2*(T + phi(T))` to
586/// avoid edge cases with the point at infinity.
587///
588/// # Returns
589///
590/// The final accumulated point and scalar after processing all bits.
591///
592/// # Wire Layout (per row)
593///
594/// | Col | 0 | 1 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
595/// |-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|
596/// | | x_T | y_T | x_P | y_P | n | x_R | y_R | s1 | s3 | b1 | b2 | b3 | b4 |
597///
598/// # Panics
599///
600/// Will panic if `bits` length is not a multiple of 4.
601pub fn gen_witness<F: Field + core::fmt::Display>(
602 w: &mut [Vec<F>; COLUMNS],
603 row0: usize,
604 endo: F,
605 base: (F, F),
606 bits: &[bool],
607 acc0: (F, F),
608) -> EndoMulResult<F> {
609 let bits_per_row = 4;
610 let rows = bits.len() / 4;
611 assert_eq!(0, bits.len() % 4);
612
613 let bits: Vec<_> = bits.iter().map(|x| F::from(u64::from(*x))).collect();
614 let one = F::one();
615
616 let mut acc = acc0;
617 let mut n_acc = F::zero();
618
619 // TODO: Could be more efficient
620 for i in 0..rows {
621 let b1 = bits[i * bits_per_row];
622 let b2 = bits[i * bits_per_row + 1];
623 let b3 = bits[i * bits_per_row + 2];
624 let b4 = bits[i * bits_per_row + 3];
625
626 let (xt, yt) = base;
627 let (xp, yp) = acc;
628
629 let xq1 = (one + (endo - one) * b1) * xt;
630 let yq1 = (b2.double() - one) * yt;
631
632 let s1 = (yq1 - yp) / (xq1 - xp);
633 let s1_squared = s1.square();
634 // (2*xp – s1^2 + xq) * ((xp – xr) * s1 + yr + yp) = (xp – xr) * 2*yp
635 // => 2 yp / (2*xp – s1^2 + xq) = s1 + (yr + yp) / (xp – xr)
636 // => 2 yp / (2*xp – s1^2 + xq) - s1 = (yr + yp) / (xp – xr)
637 //
638 // s2 := 2 yp / (2*xp – s1^2 + xq) - s1
639 //
640 // (yr + yp)^2 = (xp – xr)^2 * (s1^2 – xq1 + xr)
641 // => (s1^2 – xq1 + xr) = (yr + yp)^2 / (xp – xr)^2
642 //
643 // => xr = s2^2 - s1^2 + xq
644 // => yr = s2 * (xp - xr) - yp
645 let s2 = yp.double() / (xp.double() + xq1 - s1_squared) - s1;
646
647 // (xr, yr)
648 let xr = xq1 + s2.square() - s1_squared;
649 let xp_xr = xp - xr;
650 let yr = xp_xr * s2 - yp;
651
652 let xq2 = (one + (endo - one) * b3) * xt;
653 let yq2 = (b4.double() - one) * yt;
654 let s3 = (yq2 - yr) / (xq2 - xr);
655 let s3_squared = s3.square();
656 let s4 = yr.double() / (xr.double() + xq2 - s3_squared) - s3;
657
658 let xs = xq2 + s4.square() - s3_squared;
659 let xr_xs = xr - xs;
660 let ys = xr_xs * s4 - yr;
661
662 let inv = (xp_xr * xr_xs)
663 .inverse()
664 .expect("xr to be distinct from xp and xs");
665
666 let row = i + row0;
667
668 w[0][row] = base.0;
669 w[1][row] = base.1;
670 w[2][row] = inv;
671 w[4][row] = xp;
672 w[5][row] = yp;
673 w[6][row] = n_acc;
674 w[7][row] = xr;
675 w[8][row] = yr;
676 w[9][row] = s1;
677 w[10][row] = s3;
678 w[11][row] = b1;
679 w[12][row] = b2;
680 w[13][row] = b3;
681 w[14][row] = b4;
682
683 acc = (xs, ys);
684
685 n_acc.double_in_place();
686 n_acc += b1;
687 n_acc.double_in_place();
688 n_acc += b2;
689 n_acc.double_in_place();
690 n_acc += b3;
691 n_acc.double_in_place();
692 n_acc += b4;
693 }
694 w[4][row0 + rows] = acc.0;
695 w[5][row0 + rows] = acc.1;
696 w[6][row0 + rows] = n_acc;
697
698 EndoMulResult { acc, n: n_acc }
699}