Skip to main content

kimchi/circuits/polynomials/
endosclmul.rs

1//! This module implements the **EndoMul** gate for short Weierstrass curve
2//! endomorphism-optimized variable base scalar multiplication.
3//!
4//! # Purpose
5//!
6//! Compute `[scalar] * base_point` where the scalar is given as bits and the
7//! base point is a curve point. This is the core operation for EC-based
8//! cryptography in-circuit.
9//!
10//! # Notation
11//!
12//! - `T`: The fixed base point for the full scalar multiplication
13//! - `P`: The running accumulator point (changes row by row)
14//!
15//! # Inputs (per row)
16//!
17//! - `(x_T, y_T)`: Base point T being multiplied (columns 0, 1)
18//! - `(x_P, y_P)`: Current accumulator point P (columns 4, 5)
19//! - `n`: Current accumulated scalar value (column 6)
20//! - `b1, b2, b3, b4`: Four scalar bits for this row (columns 11-14)
21//!
22//! # Outputs (in next row)
23//!
24//! - `(x_S, y_S)`: Updated accumulator point after processing 4 bits (cols 4,5)
25//! - `n'`: Updated accumulated scalar: `n' = 16*n + 8*b1 + 4*b2 + 2*b3 + b4`
26//!
27//! # Endomorphism-optimized scalar multiplication
28//!
29//! For curves of the form y^2 = x^3 + b (like Pallas and Vesta), there exists
30//! an efficient endomorphism phi defined by:
31//!
32//!   phi(x, y) = (endo * x, y)
33//!
34//! where `endo` (also called xi) is a primitive cube root of unity in the base
35//! field. This works because (endo * x)^3 = endo^3 * x^3 = x^3, so the point
36//! remains on the curve.
37//!
38//! This endomorphism corresponds to scalar multiplication by lambda:
39//!
40//!   phi(T) = [lambda]T
41//!
42//! where lambda is a primitive cube root of unity in the scalar field.
43//!
44//! ## How the optimization works
45//!
46//! The key insight is that we can compute `P + phi(T)` or `P - phi(T)` almost
47//! as cheaply as `P + T` or `P - T`, because applying phi only requires
48//! multiplying the x-coordinate by `endo`.
49//!
50//! For a 2-bit window (b1, b2), we can encode 4 different point operations:
51//!
52//! | b1 | b2 | Point Q added to accumulator P |
53//! |----|----|--------------------|
54//! |  0 |  0 |  -T                |
55//! |  0 |  1 |   T                |
56//! |  1 |  0 |  -phi(T)           |
57//! |  1 |  1 |   phi(T)           |
58//!
59//! This is achieved by:
60//! - `xq = (1 + (endo - 1) * b1) * x_T` = x_T if b1=0, or endo * x_T if b1=1
61//! - `yq = (2 * b2 - 1) * y_T` = -y_T if b2=0, or y_T if b2=1
62//!
63//! So (xq, yq) represents one of {T, -T, phi(T), -phi(T)} based on (b1, b2).
64//!
65//! ## Why phi(T)? The GLV optimization
66//!
67//! When we want to compute `[k]T` for a large scalar `k`, a standard
68//! variable-base method uses roughly one double-and-add update per scalar bit
69//! (~256 updates for a 256-bit scalar). The GLV method
70//! (Gallant-Lambert-Vanstone) cuts this roughly in half.
71//!
72//! The key insight is that any scalar k can be decomposed as:
73//!
74//!   k = k1 + k2 * lambda (mod r)
75//!
76//! where k1, k2 are roughly half the bit-length of k (about 128 bits each).
77//! Since `phi(T) = [lambda]T`, we can rewrite:
78//!
79//!   [k]T = [k1]T + [k2][lambda]T = [k1]T + [k2]phi(T)
80//!
81//! Now instead of one 256-bit scalar multiplication, we have two 128-bit scalar
82//! multiplications: `[k1]T` and `[k2]phi(T)`. But we can do even better by
83//! computing both **simultaneously** using a multi-scalar multiplication
84//! approach.
85//!
86//! In each step, we process one bit from k1 and one bit from k2 together. The
87//! 2-bit encoding (b1, b2) selects which combination of T and phi(T) to add:
88//!
89//! - b1 selects between T (b1=0) and phi(T) (b1=1)
90//! - b2 selects the sign: negative (b2=0) or positive (b2=1)
91//!
92//! | b1 | b2 | Point added |
93//! |----|----|-------------|
94//! |  0 |  0 |  -T         |
95//! |  0 |  1 |   T         |
96//! |  1 |  0 |  -phi(T)    |
97//! |  1 |  1 |   phi(T)    |
98//!
99//! The negative points come from the y-coordinate formula: `yq = (2*b2 - 1)*y_T`.
100//! When b2=0, we get `-y_T`, which negates the point (since `-P = (x, -y)` on
101//! elliptic curves). We need both positive and negative points to encode the
102//! scalar using a **signed digit representation**. With 2 bits we represent 4
103//! distinct values `{-1, +1} x {T, phi(T)}`, which is more expressive than just
104//! `{0, 1} x {T, phi(T)}`. This signed representation is part of what makes the
105//! GLV method efficient - it allows the scalar decomposition to use both
106//! positive and negative contributions.
107//!
108//! This interleaves the bits of k1 and k2, processing one bit of each per
109//! accumulator update. Since k1 and k2 are ~128 bits, we need only ~128 updates
110//! instead of ~256, halving the circuit size.
111//!
112//! The gate processes 4 bits per row (two consecutive accumulator updates),
113//! so a 128-bit scalar requires 32 rows of EndoMul gates.
114//!
115//! ## Protocol fit
116//!
117//! In Kimchi/Snarky terminology, this gate enforces the **EC_endoscale**
118//! point-update constraint (the point side of endomorphism-optimized scaling
119//! rounds). In Pickles terms, this corresponds to endomorphism-optimized
120//! point updates used with scalar challenges.
121//!
122//! This gate is used to implement recursive verifier IPA/bulletproof
123//! point-folding logic efficiently (GLV endomorphism optimization), including
124//! repeated accumulator updates of the form `A <- (A + Q) + A`.
125//!
126//! Typical protocol usage:
127//!
128//! - **Wrap proofs**: used as part of normal wrap-circuit verification of step
129//!   proofs (part of the wrap recursion flow).
130//! - **Step proofs (recursive setting)**: used when the step circuit verifies
131//!   previous proofs (e.g. `max_proofs_verified > 0`).
132//! - **Non-recursive step circuits**: not inherently required; if no recursive
133//!   verification gadget is instantiated, this gate need not be active.
134//!
135//! ## Usage
136//!
137//! To compute `[scalar] * base` for a 128-bit scalar:
138//!
139//! 1. **Set up gates**: Create 32 consecutive EndoMul gates (128 bits / 4 bits
140//!    per row), followed by one Zero gate. The Zero gate is required because
141//!    each EndoMul gate reads the accumulator from the next row.
142//!
143//! 2. **Compute initial accumulator**: To avoid point-at-infinity edge cases,
144//!    initialize the accumulator as `acc0 = 2 * (T + phi(T))` where T is the
145//!    base point and phi is the endomorphism.
146//!
147//! 3. **Prepare scalar bits**: Convert the scalar to bits in **MSB-first**
148//!    order (most significant bit at index 0).
149//!
150//! 4. **Generate witness**: Call `gen_witness` with the witness array, starting
151//!    row, endo coefficient, base point coordinates, MSB-first bits, and
152//!    initial accumulator. The function returns the final accumulated point
153//!    and the reconstructed scalar value.
154//!
155//! See `kimchi/src/tests/endomul.rs` for a complete example.
156//!
157//! ## Invariants
158//!
159//! The following invariants **must** be respected:
160//!
161//! 1. **Bit count**: `bits.len()` must be a multiple of 4.
162//!
163//! 2. **Bit order**: Bits must be in **MSB-first** order (most significant bit
164//!    at index 0).
165//!
166//! 3. **Gate chain**: For `n` bits, you need `n/4` consecutive EndoMul gates,
167//!    followed by a Zero gate (or any gate that doesn't constrain the EndoMul
168//!    output columns). The Zero gate is needed because EndoMul reads from the
169//!    next row.
170//!
171//! 4. **Initial accumulator**: `acc0` must not be the point at infinity. The
172//!    standard initialization is `acc0 = 2 * (T + phi(T))` where T is the base
173//!    point. This ensures the accumulator never hits the point at infinity
174//!    during computation.
175//!
176//! 5. **Endo coefficient**: The `endo` parameter must be the correct cube root
177//!    of unity for the curve, obtained via `endos::<Curve>()`.
178//!
179//! 6. **Base point consistency**: The base point `(x_T, y_T)` must be the same
180//!    across all rows of a single scalar multiplication.
181//!
182//! 7. **Scalar value verification**: The EndoMul gate only constrains the
183//!    row-to-row relationship `n' = 16*n + 8*b1 + 4*b2 + 2*b3 + b4`. It does
184//!    **not** constrain the initial or final value of `n`. The calling circuit
185//!    must add external constraints.
186//!    To enforce:
187//!    - Initial `n = 0` at the first EndoMul row
188//!    - Final `n = k` where `k` is the expected scalar value
189//!
190//! ## References
191//!
192//! - Halo paper, Section 6.2: <https://eprint.iacr.org/2019/1021>
193//! - GLV method: <https://www.iacr.org/archive/crypto2001/21390189.pdf>
194
195use crate::{
196    circuits::{
197        argument::{Argument, ArgumentEnv, ArgumentType},
198        berkeley_columns::{BerkeleyChallengeTerm, BerkeleyChallenges},
199        constraints::ConstraintSystem,
200        expr::{
201            self,
202            constraints::{boolean, ExprOps},
203            Cache,
204        },
205        gate::{CircuitGate, GateType},
206        wires::{GateWires, COLUMNS},
207    },
208    curve::KimchiCurve,
209    proof::{PointEvaluations, ProofEvaluations},
210};
211use ark_ff::{Field, PrimeField};
212use core::marker::PhantomData;
213
214//~ We implement custom gate constraints for short Weierstrass curve
215//~ endomorphism optimized variable base scalar multiplication.
216//~
217//~ Given a finite field $\mathbb{F}_{q}$ of order $q$, if the order is not a
218//~ multiple of 2 nor 3, then an
219//~ elliptic curve over $\mathbb{F}_{q}$ in short Weierstrass form is
220//~ represented by the set of points $(x,y)$ that satisfy the following
221//~ equation with $a,b\in\mathbb{F}_{q}$ and $4a^3+27b^2\neq_{\mathbb{F}_q} 0$:
222//~ $$E(\mathbb{F}_q): y^2 = x^3 + a x + b$$
223//~ If $P=(x_p, y_p)$ and $T=(x_t, y_t)$ are two points in the curve
224//~ $E(\mathbb{F}_q)$, the goal of this operation is to compute
225//~ $S = (P + Q) + P$ where $Q \in \{T, -T, \phi(T), -\phi(T)\}$ is determined
226//~ by bits $(b_1, b_2)$. Here $\phi$ is the curve endomorphism
227//~ $\phi(x,y) = (\mathtt{endo} \cdot x, y)$.
228//~
229//~ The bits encode the point $Q$ as follows:
230//~ * $b_1 = 0$: use $T$, i.e., $x_q = x_t$
231//~ * $b_1 = 1$: use $\phi(T)$, i.e., $x_q = \mathtt{endo} \cdot x_t$
232//~ * $b_2 = 0$: negate, i.e., $y_q = -y_t$
233//~ * $b_2 = 1$: keep sign, i.e., $y_q = y_t$
234//~
235//~ This technique allows processing 2 bits of the scalar per point operation.
236//~ Since each row performs two such operations (using bits $b_1, b_2$ and then
237//~ $b_3, b_4$), we process 4 bits per row.
238//~
239//~ In particular, the constraints of this gate take care of 4 bits of the
240//~ scalar within a single EVBSM row. When the scalar is longer (which will
241//~ usually be the case), multiple EVBSM rows will be concatenated.
242//~
243//~ | Row | 0  | 1  | 2    | 3 | 4  | 5  | 6  | 7   | 8   | 9   | 10  | 11  | 12  | 13  | 14  | Type  |
244//~ |-----|----|----|------|---|----|----|----|-----|-----|-----|-----|-----|-----|-----|-----|-------|
245//~ |   i | xT | yT | inv  | Ø | xP | yP | n  | xR  | yR  | s1  | s3  | b1  | b2  | b3  | b4  | EVBSM |
246//~ | i+1 | =  | =  | inv' |   | xS | yS | n' | xR' | yR' | s1' | s3' | b1' | b2' | b3' | b4' | EVBSM |
247//~
248//~ The gate performs two accumulator updates per row, each of the form
249//~ `A <- (A + Q) + A = 2A + Q`.
250//~
251//~ - First, bits `(b1, b2)` select `Q1` in `{T, -T, \phi(T), -\phi(T)}`, and the
252//~ stored point `R = (xR, yR)` is the output of the first update: `R = (P + Q1) + P`.
253//~ - Second, bits `(b3, b4)` select `Q2` in the same set, and the stored point
254//~ `S = (xS, yS)` is the output of the second update: `S = (R + Q2) + R`.
255//~
256//~ The intermediate sums `P + Q1` and `R + Q2` are not stored as witness
257//~ columns. On the next row, `(xS, yS)` becomes the new `(xP, yP)`, and
258//~ `(xR', yR')` is the next row's first-update output.
259//~
260//~ The variables (`xT`, `yT`), (`xP`, `yP`), (`xR`, `yR`), and (`xS`, `yS`)
261//~ are the corresponding affine coordinates of points `T`, `P`, `R`, and `S`.
262//~
263//~ `n` and `n'` are accumulated scalar prefixes in MSB-first order, where `n'`
264//~ extends `n` with the next 4-bit chunk encoded by `b1..b4` with `n ≤ n'``.
265//~ `s1` and `s3` are intermediary values used to compute the slopes from the
266//~ curve addition formula.
267//~
268//~ The layout of this gate (and the next row) allows for this chained behavior where the output point
269//~ of the current row $S$ gets accumulated as one of the inputs of the following row, becoming $P$ in
270//~ the next constraints. Similarly, the scalar is decomposed into binary form and $n$ ($n'$ respectively)
271//~ will store the current accumulated value and the next one for the check.
272//~
273//~ For readability, we define the following variables for the constraints:
274//~
275//~ * `endo` $:=$ `EndoCoefficient`
276//~ * `xq1` $:= (1 + ($`endo`$ - 1)\cdot b_1) \cdot x_t$
277//~ * `xq2` $:= (1 + ($`endo`$ - 1)\cdot b_3) \cdot x_t$
278//~ * `yq1` $:= (2\cdot b_2 - 1) \cdot y_t$
279//~ * `yq2` $:= (2\cdot b_4 - 1) \cdot y_t$
280//~
281//~ Note: each row is performing two additions, so we use two selected points:
282//~ `Q1 := (xq1, yq1)` from bits `(b1, b2)` and `Q2 := (xq2, yq2)` from bits
283//~ `(b3, b4)`. They are points, and each is selected from
284//~ `Q:={T, -T, \phi(T), -\phi(T)}` by its corresponding bit pair. That means:
285//~
286//~ Selection table for the first selected point `Q1`:
287//~
288//~ | b1 | b2 | Q1       | (xq1, yq1)               |
289//~ |----|----|----------|--------------------------|
290//~ | 0  | 0  | -T       | (x_t, -y_t)              |
291//~ | 0  | 1  |  T       | (x_t,  y_t)              |
292//~ | 1  | 0  | -\phi(T) | (`endo` \cdot x_t, -y_t) |
293//~ | 1  | 1  |  \phi(T) | (`endo` \cdot x_t,  y_t) |
294//~
295//~ Selection table for the second selected point `Q2`:
296//~
297//~ | b3 | b4 | Q2       | (xq2, yq2)               |
298//~ |----|----|----------|--------------------------|
299//~ | 0  | 0  | -T       | (x_t, -y_t)              |
300//~ | 0  | 1  |  T       | (x_t,  y_t)              |
301//~ | 1  | 0  | -\phi(T) | (`endo` \cdot x_t, -y_t) |
302//~ | 1  | 1  |  \phi(T) | (`endo` \cdot x_t,  y_t) |
303//~
304//~ These are the 12 constraints that correspond to each EVBSM gate,
305//~ which take care of 4 bits of the scalar within a single EVBSM row:
306//~
307//~ * First block:
308//~   * `(xq1 - xp) * s1 = yq1 - yp`
309//~   * `(2*xp - s1^2 + xq1) * ((xp - xr)*s1 + yr + yp) = (xp - xr) * 2*yp`
310//~   * `(yr + yp)^2 = (xp – xr)^2 * (s1^2 – xq1 + xr)`
311//~ * Second block:
312//~   * `(xq2 - xr) * s3 = yq2 - yr`
313//~   * `(2*xr - s3^2 + xq2) * ((xr - xs)*s3 + ys + yr) = (xr - xs) * 2*yr`
314//~   * `(ys + yr)^2 = (xr – xs)^2 * (s3^2 – xq2 + xs)`
315//~ * Booleanity checks:
316//~   * Bit flag $b_1$: `0 = b1 * (b1 - 1)`
317//~   * Bit flag $b_2$: `0 = b2 * (b2 - 1)`
318//~   * Bit flag $b_3$: `0 = b3 * (b3 - 1)`
319//~   * Bit flag $b_4$: `0 = b4 * (b4 - 1)`
320//~ * Binary decomposition:
321//~   * Accumulated scalar: `n' = 16 * n + 8 * b1 + 4 * b2 + 2 * b3 + b4`
322//~ * Distinct point checks:
323//~   * `(xp - xr) * (xr - xs) * inv = 1`
324//~     - Note: if `xp = xr` (equiv `xr = xs`) then we see `(yr + yp)^2 = 0`
325//~       from constraint 3, and so we are necessarily in the disallowed
326//~       degenerate case `P=-R` (`xp = xr` and `yr = -yp`).
327//~
328//~ Note: in the EC derivation below, `R` and `S` are local symbols inside each
329//~ block's addition formulas. The witness columns still follow the row layout
330//~ above (`xP, yP` as input, `xR, yR` after the first update, `xS, yS` after
331//~ the second update).
332//~
333//~ The constraints above are derived from the following EC Affine arithmetic
334//~ equations.
335//~
336//~ **Background on EC point addition/doubling:**
337//~
338//~ For points P = (x_p, y_p) and Q = (x_q, y_q) on a short Weierstrass curve,
339//~ the sum R = P + Q = (x_r, y_r) is computed as:
340//~
341//~ * Slope: $s = (y_q - y_p) / (x_q - x_p)$
342//~ * $x_r = s^2 - x_p - x_q$
343//~ * $y_r = s \cdot (x_p - x_r) - y_p$
344//~
345//~ For point doubling R = 2P:
346//~
347//~ * Slope: $s = (3 x_p^2 + a) / (2 y_p)$ (where a=0 for our curves)
348//~ * $x_r = s^2 - 2 \cdot x_p$
349//~ * $y_r = s \cdot (x_p - x_r) - y_p$
350//~
351//~ **Derivation of the constraints:**
352//~
353//~ Each "block" computes S = (P + Q) + P where Q = (xq, yq) is determined by
354//~ bits. The intermediate point R = P + Q and final point S = R + P.
355//~
356//~ We use two slopes:
357//~ * $s_1$: slope for P + Q -> R
358//~ * $s_2$: slope for R + P -> S
359//~
360//~ The key optimization is eliminating $s_2$ from the constraints by
361//~ substituting:
362//~
363//~ * (1) => $(x_{q_1} - x_p) \cdot s_1 = y_{q_1} - y_p$
364//~ * (2&3) => $(x_p – x_r) \cdot s_2 = y_r + y_p$
365//~ * (2) => $(2 \cdot x_p + x_{q_1} – s_1^2) \cdot (s_1 + s_2) = 2 \cdot y_p$
366//~   * <=> $(2 x_p - s_1^2 + x_{q_1})((x_p - x_r) s_1 + y_r + y_p)$
367//~         $= (x_p - x_r) \cdot 2 y_p$
368//~ * (3) => $s_1^2 - s_2^2 = x_{q_1} - x_r$
369//~   * <=> $(y_r + y_p)^2 = (x_p – x_r)^2 \cdot (s_1^2 – x_{q_1} + x_r)$
370//~ *
371//~ * (4) => $(x_{q_2} - x_r) \cdot s_3 = y_{q_2} - y_r$
372//~ * (5&6) => $(x_r – x_s) \cdot s_4 = y_s + y_r$
373//~ * (5) => $(2 \cdot x_r + x_{q_2} – s_3^2) \cdot (s_3 + s_4) = 2 \cdot y_r$
374//~   * <=> $(2 x_r - s_3^2 + x_{q_2})((x_r - x_s) s_3 + y_s + y_r)$
375//~         $= (x_r - x_s) \cdot 2 y_r$
376//~ * (6) => $s_3^2 – s_4^2 = x_{q_2} - x_s$
377//~   * <=> $(y_s + y_r)^2 = (x_r – x_s)^2 \cdot (s_3^2 – x_{q_2} + x_s)$
378//~
379//~ Defining $s_2$ and $s_4$ as
380//~
381//~ * $s_2 := \frac{2 \cdot y_P}{2 * x_P + x_{q_1} - s_1^2} - s_1$
382//~ * $s_4 := \frac{2 \cdot y_R}{2 * x_R + x_{q_2} - s_3^2} - s_3$
383//~
384//~ Gives the following equations when substituting $s_2$ and $s_4$:
385//~
386//~ 1. `(xq1 - xp) * s1 = yq1 - yp` (i.e., `(2 * b2 - 1) * yt - yp`)
387//~ 2. `(2*xp - s1^2 + xq1) * ((xp - xr)*s1 + yr + yp) = (xp - xr) * 2*yp`
388//~ 3. `(yr + yp)^2 = (xp – xr)^2 * (s1^2 – xq1 + xr)`
389//~
390//~ 4. `(xq2 - xr) * s3 = yq2 - yr` (i.e., `(2 * b4 - 1) * yt - yr`)
391//~ 5. `(2*xr - s3^2 + xq2) * ((xr - xs)*s3 + ys + yr) = (xr - xs) * 2*yr`
392//~ 6. `(ys + yr)^2 = (xr – xs)^2 * (s3^2 – xq2 + xs)`
393//~
394
395/// Implementation of group endomorphism optimized
396/// variable base scalar multiplication custom Plonk constraints.
397impl<F: PrimeField> CircuitGate<F> {
398    pub fn create_endomul(wires: GateWires) -> Self {
399        CircuitGate::new(GateType::EndoMul, wires, vec![])
400    }
401
402    /// Verify the `EndoMul` gate.
403    ///
404    /// # Errors
405    ///
406    /// Will give error if `self.typ` is not `GateType::EndoMul`, or if
407    /// constraint evaluation fails.
408    pub fn verify_endomul<
409        const FULL_ROUNDS: usize,
410        G: KimchiCurve<FULL_ROUNDS, ScalarField = F>,
411    >(
412        &self,
413        row: usize,
414        witness: &[Vec<F>; COLUMNS],
415        cs: &ConstraintSystem<F>,
416    ) -> Result<(), String> {
417        ensure_eq!(self.typ, GateType::EndoMul, "incorrect gate type");
418
419        let this: [F; COLUMNS] = core::array::from_fn(|i| witness[i][row]);
420        let next: [F; COLUMNS] = core::array::from_fn(|i| witness[i][row + 1]);
421
422        let pt = F::from(123456u64);
423
424        let constants = expr::Constants {
425            mds: &G::sponge_params().mds,
426            endo_coefficient: cs.endo,
427            zk_rows: cs.zk_rows,
428        };
429        let challenges = BerkeleyChallenges {
430            alpha: F::zero(),
431            beta: F::zero(),
432            gamma: F::zero(),
433            joint_combiner: F::zero(),
434        };
435
436        let evals: ProofEvaluations<PointEvaluations<G::ScalarField>> =
437            ProofEvaluations::dummy_with_witness_evaluations(this, next);
438
439        let constraints = EndosclMul::constraints(&mut Cache::default());
440        for (i, c) in constraints.iter().enumerate() {
441            match c.evaluate_(cs.domain.d1, pt, &evals, &constants, &challenges) {
442                Ok(x) => {
443                    if x != F::zero() {
444                        return Err(format!("Bad endo equation {i}"));
445                    }
446                }
447                Err(e) => return Err(format!("evaluation failed: {e}")),
448            }
449        }
450
451        Ok(())
452    }
453
454    pub fn endomul(&self) -> F {
455        if self.typ == GateType::EndoMul {
456            F::one()
457        } else {
458            F::zero()
459        }
460    }
461}
462
463/// Implementation of the `EndosclMul` gate.
464#[derive(Default)]
465pub struct EndosclMul<F>(PhantomData<F>);
466
467impl<F> Argument<F> for EndosclMul<F>
468where
469    F: PrimeField,
470{
471    const ARGUMENT_TYPE: ArgumentType = ArgumentType::Gate(GateType::EndoMul);
472    const CONSTRAINTS: u32 = 12;
473
474    fn constraint_checks<T: ExprOps<F, BerkeleyChallengeTerm>>(
475        env: &ArgumentEnv<F, T>,
476        cache: &mut Cache,
477    ) -> Vec<T> {
478        let b1 = env.witness_curr(11);
479        let b2 = env.witness_curr(12);
480        let b3 = env.witness_curr(13);
481        let b4 = env.witness_curr(14);
482
483        let xt = env.witness_curr(0);
484        let yt = env.witness_curr(1);
485
486        let inv = env.witness_curr(2);
487
488        let xs = env.witness_next(4);
489        let ys = env.witness_next(5);
490
491        let xp = env.witness_curr(4);
492        let yp = env.witness_curr(5);
493
494        let xr = env.witness_curr(7);
495        let yr = env.witness_curr(8);
496
497        let s1 = env.witness_curr(9);
498        let s3 = env.witness_curr(10);
499
500        let endo_minus_1 = env.endo_coefficient() - T::one();
501        let xq1 = cache.cache((T::one() + b1.clone() * endo_minus_1.clone()) * xt.clone());
502        let xq2 = cache.cache((T::one() + b3.clone() * endo_minus_1) * xt);
503
504        let yq1 = (b2.double() - T::one()) * yt.clone();
505        let yq2 = (b4.double() - T::one()) * yt;
506
507        let s1_squared = cache.cache(s1.square());
508        let s3_squared = cache.cache(s3.square());
509
510        // n_next = 16*n + 8*b1 + 4*b2 + 2*b3 + b4
511        let n = env.witness_curr(6);
512        let n_next = env.witness_next(6);
513        let n_constraint =
514            (((n.double() + b1.clone()).double() + b2.clone()).double() + b3.clone()).double()
515                + b4.clone()
516                - n_next;
517
518        let xp_xr = cache.cache(xp.clone() - xr.clone());
519        let xr_xs = cache.cache(xr.clone() - xs.clone());
520
521        let ys_yr = cache.cache(ys + yr.clone());
522        let yr_yp = cache.cache(yr.clone() + yp.clone());
523
524        vec![
525            // verify booleanity of the scalar bits
526            boolean(&b1),
527            boolean(&b2),
528            boolean(&b3),
529            boolean(&b4),
530            // (xq1 - xp) * s1 = yq1 - yp
531            ((xq1.clone() - xp.clone()) * s1.clone()) - (yq1 - yp.clone()),
532            // (2*xp – s1^2 + xq1) * ((xp - xr) * s1 + yr + yp) = (xp - xr) * 2*yp
533            (((xp.double() - s1_squared.clone()) + xq1.clone())
534                * ((xp_xr.clone() * s1) + yr_yp.clone()))
535                - (yp.double() * xp_xr.clone()),
536            // (yr + yp)^2 = (xp – xr)^2 * (s1^2 – xq1 + xr)
537            yr_yp.square() - (xp_xr.clone().square() * ((s1_squared - xq1) + xr.clone())),
538            // (xq2 - xr) * s3 = yq2 - yr
539            ((xq2.clone() - xr.clone()) * s3.clone()) - (yq2 - yr.clone()),
540            // (2*xr – s3^2 + xq2) * ((xr – xs) * s3 + ys + yr) = (xr - xs) * 2*yr
541            (((xr.double() - s3_squared.clone()) + xq2.clone())
542                * ((xr_xs.clone() * s3) + ys_yr.clone()))
543                - (yr.double() * xr_xs.clone()),
544            // (ys + yr)^2 = (xr – xs)^2 * (s3^2 – xq2 + xs)
545            ys_yr.square() - (xr_xs.clone().square() * ((s3_squared - xq2) + xs)),
546            n_constraint,
547            // (xp - xr) * (xr - xs) * inv = 1
548            xp_xr * xr_xs * inv - T::one(),
549        ]
550    }
551}
552
553/// The result of performing an endomorphism-optimized scalar multiplication.
554///
555/// After processing all scalar bits through the EndoMul gates, this struct
556/// holds:
557/// - The final accumulated curve point (as affine coordinates)
558/// - The reconstructed scalar value from the processed bits
559pub struct EndoMulResult<F> {
560    /// The final accumulated point (x, y) after all scalar multiplication
561    /// steps.
562    /// This equals `[scalar]T` where `T` is the base point and `scalar` is
563    /// derived from the input bits combined with the endomorphism.
564    pub acc: (F, F),
565    /// The accumulated scalar value reconstructed from all processed bits.
566    /// For a 128-bit scalar processed in 32 rows (4 bits/row), this equals
567    /// the original scalar k such that `acc = [k]T` (with endomorphism
568    /// applied).
569    pub n: F,
570}
571
572/// Generates the witness values for a series of EndoMul gates.
573///
574/// This function computes the witness for endomorphism-optimized scalar
575/// multiplication. It processes 4 bits of the scalar per row, computing
576/// the intermediate curve points and slopes needed for the constraints.
577///
578/// # Arguments
579///
580/// * `w` - The witness array to populate (15 columns x num_rows)
581/// * `row0` - The starting row index
582/// * `endo` - The endomorphism coefficient (cube root of unity in base field)
583/// * `base` - The base point T = (x_T, y_T) being multiplied
584/// * `bits` - Scalar bits in MSB-first order. Length must be a multiple of 4.
585/// * `acc0` - Initial accumulator point. Typically set to `2*(T + phi(T))` to
586///   avoid edge cases with the point at infinity.
587///
588/// # Returns
589///
590/// The final accumulated point and scalar after processing all bits.
591///
592/// # Wire Layout (per row)
593///
594/// | Col |  0  |  1  |  4  |  5  |  6  |  7  |  8  |  9  | 10  | 11  | 12  | 13  | 14  |
595/// |-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|
596/// |     | x_T | y_T | x_P | y_P |  n  | x_R | y_R | s1  | s3  | b1  | b2  | b3  | b4  |
597///
598/// # Panics
599///
600/// Will panic if `bits` length is not a multiple of 4.
601pub fn gen_witness<F: Field + core::fmt::Display>(
602    w: &mut [Vec<F>; COLUMNS],
603    row0: usize,
604    endo: F,
605    base: (F, F),
606    bits: &[bool],
607    acc0: (F, F),
608) -> EndoMulResult<F> {
609    let bits_per_row = 4;
610    let rows = bits.len() / 4;
611    assert_eq!(0, bits.len() % 4);
612
613    let bits: Vec<_> = bits.iter().map(|x| F::from(u64::from(*x))).collect();
614    let one = F::one();
615
616    let mut acc = acc0;
617    let mut n_acc = F::zero();
618
619    // TODO: Could be more efficient
620    for i in 0..rows {
621        let b1 = bits[i * bits_per_row];
622        let b2 = bits[i * bits_per_row + 1];
623        let b3 = bits[i * bits_per_row + 2];
624        let b4 = bits[i * bits_per_row + 3];
625
626        let (xt, yt) = base;
627        let (xp, yp) = acc;
628
629        let xq1 = (one + (endo - one) * b1) * xt;
630        let yq1 = (b2.double() - one) * yt;
631
632        let s1 = (yq1 - yp) / (xq1 - xp);
633        let s1_squared = s1.square();
634        // (2*xp – s1^2 + xq) * ((xp – xr) * s1 + yr + yp) = (xp – xr) * 2*yp
635        // => 2 yp / (2*xp – s1^2 + xq) = s1 + (yr + yp) / (xp – xr)
636        // => 2 yp / (2*xp – s1^2 + xq) - s1 = (yr + yp) / (xp – xr)
637        //
638        // s2 := 2 yp / (2*xp – s1^2 + xq) - s1
639        //
640        // (yr + yp)^2 = (xp – xr)^2 * (s1^2 – xq1 + xr)
641        // => (s1^2 – xq1 + xr) = (yr + yp)^2 / (xp – xr)^2
642        //
643        // => xr = s2^2 - s1^2 + xq
644        // => yr = s2 * (xp - xr) - yp
645        let s2 = yp.double() / (xp.double() + xq1 - s1_squared) - s1;
646
647        // (xr, yr)
648        let xr = xq1 + s2.square() - s1_squared;
649        let xp_xr = xp - xr;
650        let yr = xp_xr * s2 - yp;
651
652        let xq2 = (one + (endo - one) * b3) * xt;
653        let yq2 = (b4.double() - one) * yt;
654        let s3 = (yq2 - yr) / (xq2 - xr);
655        let s3_squared = s3.square();
656        let s4 = yr.double() / (xr.double() + xq2 - s3_squared) - s3;
657
658        let xs = xq2 + s4.square() - s3_squared;
659        let xr_xs = xr - xs;
660        let ys = xr_xs * s4 - yr;
661
662        let inv = (xp_xr * xr_xs)
663            .inverse()
664            .expect("xr to be distinct from xp and xs");
665
666        let row = i + row0;
667
668        w[0][row] = base.0;
669        w[1][row] = base.1;
670        w[2][row] = inv;
671        w[4][row] = xp;
672        w[5][row] = yp;
673        w[6][row] = n_acc;
674        w[7][row] = xr;
675        w[8][row] = yr;
676        w[9][row] = s1;
677        w[10][row] = s3;
678        w[11][row] = b1;
679        w[12][row] = b2;
680        w[13][row] = b3;
681        w[14][row] = b4;
682
683        acc = (xs, ys);
684
685        n_acc.double_in_place();
686        n_acc += b1;
687        n_acc.double_in_place();
688        n_acc += b2;
689        n_acc.double_in_place();
690        n_acc += b3;
691        n_acc.double_in_place();
692        n_acc += b4;
693    }
694    w[4][row0 + rows] = acc.0;
695    w[5][row0 + rows] = acc.1;
696    w[6][row0 + rows] = n_acc;
697
698    EndoMulResult { acc, n: n_acc }
699}