generalize rotate-and-reduce kernel implementation to support ct-ct BSGS

j2kun · j2kun · commit cc216d2a64cb · 2025-10-31T10:35:26.000-07:00
diff --git a/lib/Kernel/KernelImplementation.h b/lib/Kernel/KernelImplementation.h
@@ -49,41 +49,57 @@ implementMatvec(KernelName kernelName, const T& matrix, const T& vector) {
   return accumulatedSum;
 }
 
-// Returns an arithmetic DAG that implements a rotate and reduce op. Ensure
-// this is only generated for T a subclass of AbstractValue.
+// Returns an arithmetic DAG that implements a logarithmic rotate-and-reduce
+// accumulation of an input ciphertext.
+//
+// This is a special case of `tensor_ext.rotate_and_reduce`
 template <typename T>
 std::enable_if_t<std::is_base_of<AbstractValue, T>::value,
                  std::shared_ptr<ArithmeticDagNode<T>>>
-implementRotateAndReduce(const T& vector, std::optional<T> plaintexts,
-                         int64_t period, int64_t steps,
-                         const std::string& reduceOp = "arith.addi") {
+implementRotateAndReduceAccumulation(
+    const T& vector, int64_t period, int64_t steps,
+    std::function<std::shared_ptr<ArithmeticDagNode<T>>(
+        std::shared_ptr<ArithmeticDagNode<T>>,
+        std::shared_ptr<ArithmeticDagNode<T>>)>
+        reduceFunc) {
   using NodeTy = ArithmeticDagNode<T>;
   auto vectorDag = NodeTy::leaf(vector);
 
-  auto performReduction = [&](std::shared_ptr<NodeTy> left,
-                              std::shared_ptr<NodeTy> right) {
-    if (reduceOp == "arith.addi" || reduceOp == "arith.addf") {
-      return NodeTy::add(left, right);
-    }
-
-    if (reduceOp == "arith.muli" || reduceOp == "arith.mulf") {
-      return NodeTy::mul(left, right);
-    }
-
-    // Default to add for unknown operations
-    return NodeTy::add(left, right);
-  };
-
-  if (!plaintexts.has_value()) {
-    for (int64_t shiftSize = steps / 2; shiftSize > 0; shiftSize /= 2) {
-      auto rotated = NodeTy::leftRotate(vectorDag, shiftSize * period);
-      auto reduced = performReduction(vectorDag, rotated);
-      vectorDag = reduced;
-    }
-    return vectorDag;
+  for (int64_t shiftSize = steps / 2; shiftSize > 0; shiftSize /= 2) {
+    auto rotated = NodeTy::leftRotate(vectorDag, shiftSize * period);
+    // Fold the rotated copy into the running value via the caller's reducer.
+    vectorDag = reduceFunc(vectorDag, rotated);
   }
+  return vectorDag;
+}
 
-  auto plaintextsDag = NodeTy::leaf(*plaintexts);
+// Returns an arithmetic DAG that implements a baby-step-giant-step
+// rotate-and-reduce accumulation between an input ciphertext
+// (giantSteppedOperand) and an abstraction over the other argument
+// (babySteppedOperand). In particular, the babySteppedOperand may be a list of
+// plaintexts like in Halevi-Shoup matvec, or a single ciphertext like in
+// bicyclic matmul, and this abstracts over both by taking in an extraction
+// callback.
+//
+// This is a special case of `tensor_ext.rotate_and_reduce`, but with the added
+// abstractions it also supports situations not currently expressible by
+// `tensor_ext.rotate_and_reduce`.
+template <typename T>
+std::enable_if_t<std::is_base_of<AbstractValue, T>::value,
+                 std::shared_ptr<ArithmeticDagNode<T>>>
+implementBabyStepGiantStep(const T& giantSteppedOperand,
+                           const T& babySteppedOperand, int64_t period,
+                           int64_t steps,
+                           std::function<std::shared_ptr<ArithmeticDagNode<T>>(
+                               std::shared_ptr<ArithmeticDagNode<T>>, int64_t)>
+                               extractFunc,
+                           std::function<std::shared_ptr<ArithmeticDagNode<T>>(
+                               std::shared_ptr<ArithmeticDagNode<T>>,
+                               std::shared_ptr<ArithmeticDagNode<T>>)>
+                               reduceFunc) {
+  using NodeTy = ArithmeticDagNode<T>;
+  auto giantSteppedDag = NodeTy::leaf(giantSteppedOperand);
+  auto babySteppedDag = NodeTy::leaf(babySteppedOperand);
 
   // Use a value of sqrt(n) as the baby step / giant step size.
   int64_t numBabySteps = static_cast<int64_t>(std::floor(std::sqrt(steps)));
@@ -112,9 +128,9 @@ implementRotateAndReduce(const T& vector, std::optional<T> plaintexts,
 
   // Compute sqrt(n) ciphertext rotations of the input as baby-steps.
   SmallVector<std::shared_ptr<NodeTy>> babyStepVals;
-  babyStepVals.push_back(vectorDag);  // rot by zero
+  babyStepVals.push_back(giantSteppedDag);  // rot by zero
   for (int64_t i = 1; i < numBabySteps; ++i) {
-    babyStepVals.push_back(NodeTy::leftRotate(vectorDag, period * i));
+    babyStepVals.push_back(NodeTy::leftRotate(giantSteppedDag, period * i));
   }
 
   // Compute the inner baby step sums.
@@ -125,7 +141,7 @@ implementRotateAndReduce(const T& vector, std::optional<T> plaintexts,
     int64_t plaintextRotationAmount = -giantStepSize * j * period;
     for (int64_t i = 0; i < numBabySteps; ++i) {
       size_t extractionIndex = i + j * giantStepSize;
-      auto plaintext = NodeTy::extract(plaintextsDag, extractionIndex);
+      auto plaintext = extractFunc(babySteppedDag, extractionIndex);
       auto rotatedPlaintext =
           NodeTy::leftRotate(plaintext, plaintextRotationAmount);
       auto multiplied = NodeTy::mul(rotatedPlaintext, babyStepVals[i]);
@@ -141,6 +157,105 @@ implementRotateAndReduce(const T& vector, std::optional<T> plaintexts,
   return result;
 }
 
+// Returns an arithmetic DAG that implements a tensor_ext.rotate_and_reduce op.
+//
+// See TensorExtOps.td docs for RotateAndReduceOp for more details.
+//
+// The `vector` argument is a ciphertext value that will be rotated O(sqrt(n))
+// times when the `plaintexts` argument is set (Baby Step Giant Step), or
+// O(log(n)) times when the `plaintexts` argument is not set (log-style
+// rotate-and-reduce accumulation).
+//
+// The `plaintexts` argument, when present, represents a vector of pre-packed
+// plaintexts that will be rotated and multiplied with the rotated `vector`
+// argument in BSGS style.
+//
+// Note that using this kernel results in places in the pipeline where a
+// plaintext type is rotated, but most FHE implementations don't have a
+// plaintext rotation operation (it would be wasteful) and instead expect the
+// "plaintext rotation" to apply to the cleartext. HEIR has places in the
+// pipeline that support this by converting a rotate(encode(cleartext)) to
+// encode(rotate(cleartext)).
+template <typename T>
+std::enable_if_t<std::is_base_of<AbstractValue, T>::value,
+                 std::shared_ptr<ArithmeticDagNode<T>>>
+implementRotateAndReduce(const T& vector, std::optional<T> plaintexts,
+                         int64_t period, int64_t steps,
+                         const std::string& reduceOp = "arith.addi") {
+  // Shorthand for the DAG node type and the shared pointer passed around.
+  using NodeTy = ArithmeticDagNode<T>;
+  using NodePtr = std::shared_ptr<NodeTy>;
+
+  // Resolve the reduction kind once. Only "arith.muli" and "arith.mulf" build
+  // a mul node; "arith.addi", "arith.addf", and any unrecognized op name all
+  // build an add node.
+  const bool useMul = reduceOp == "arith.muli" || reduceOp == "arith.mulf";
+  auto combine = [useMul](NodePtr lhs, NodePtr rhs) {
+    return useMul ? NodeTy::mul(lhs, rhs) : NodeTy::add(lhs, rhs);
+  };
+
+  // Pick the construction based on whether plaintexts were supplied.
+  if (plaintexts.has_value()) {
+    // Baby-step giant-step path: the plaintext for each step is pulled out of
+    // the packed `plaintexts` operand by index.
+    auto pickPlaintext = [](NodePtr packed, int64_t index) {
+      return NodeTy::extract(packed, index);
+    };
+    return implementBabyStepGiantStep<T>(vector, *plaintexts, period, steps,
+                                         pickPlaintext, combine);
+  }
+
+  // No plaintexts were supplied, so only `vector` is accumulated, using the
+  // logarithmic rotate-and-reduce builder.
+  return implementRotateAndReduceAccumulation<T>(vector, period, steps,
+                                                 combine);
+}
+
+// Returns an arithmetic DAG that implements a baby-step-giant-step between
+// ciphertexts.
+//
+// This implements equation 21 in 6.2.2 of LKAA25: "Tricycle: Private
+// Transformer Inference with Tricyclic Encodings"
+// https://eprint.iacr.org/2025/1200
+//
+// This differs from the above implementRotateAndReduce in that, instead of a
+// set of pre-computed plaintexts, both arguments are individual ciphertexts.
+// Normally with one ciphertext, the naive approach uses n - 1 rotations that
+// BSGS reduces to c sqrt(n) + O(1) rotations. If both inputs are ciphertexts
+// then it converts 2n - 2 total rotations to n + c sqrt(n) + O(1) rotations.
+// Essentially, the "n to sqrt(n)" reduction applies to `giantSteppedOperand`
+// only, while `babySteppedOperand` still gets n - 1 rotations.
+template <typename T>
+std::enable_if_t<std::is_base_of<AbstractValue, T>::value,
+                 std::shared_ptr<ArithmeticDagNode<T>>>
+implementCiphertextCiphertextBabyStepGiantStep(
+    const T& giantSteppedOperand, const T& babySteppedOperand, int64_t period,
+    int64_t steps, const std::string& reduceOp = "arith.addi") {
+  // Shorthand for the DAG node type and the shared pointer passed around.
+  using NodeTy = ArithmeticDagNode<T>;
+  using NodePtr = std::shared_ptr<NodeTy>;
+
+  // Resolve the reduction kind once. Only "arith.muli" and "arith.mulf" build
+  // a mul node; "arith.addi", "arith.addf", and any unrecognized op name all
+  // build an add node.
+  const bool useMul = reduceOp == "arith.muli" || reduceOp == "arith.mulf";
+  auto combine = [useMul](NodePtr lhs, NodePtr rhs) {
+    return useMul ? NodeTy::mul(lhs, rhs) : NodeTy::add(lhs, rhs);
+  };
+
+  // The baby-stepped operand is a single ciphertext, so there is nothing to
+  // index into: "extraction" hands back the same node for every index. This
+  // avoids replicating the ciphertext just to re-extract it.
+  auto passThrough = [](NodePtr operand, int64_t /*extractionIndex*/) {
+    return operand;
+  };
+
+  // Delegate to the shared baby-step giant-step builder, supplying the two
+  // callbacks defined above.
+  return implementBabyStepGiantStep<T>(giantSteppedOperand, babySteppedOperand,
+                                       period, steps, passThrough, combine);
+}
+
 // Returns an arithmetic DAG that implements the Halevi-Shoup matrix
 // multiplication algorithm. This implementation uses a rotate-and-reduce
 // operation, followed by a summation of partial sums if the matrix is not