Skip to content

Commit de4cd59

Browse files
committed
Add 1024 degree polynomial test
1 parent cea6817 commit de4cd59

File tree

2 files changed

+274
-0
lines changed

2 files changed

+274
-0
lines changed

test/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,7 @@ add_executable(test_perf test_perf.cc)
2323
target_include_directories(test_perf PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
2424
target_link_libraries(test_perf cufhe_gpu ${CUDA_LIBRARIES})
2525

26+
add_executable(test_polynomial_mult_1024 test_polynomial_mult_1024.cu)
27+
target_include_directories(test_polynomial_mult_1024 PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
28+
target_link_libraries(test_polynomial_mult_1024 cufhe_gpu ${CUDA_LIBRARIES})
29+

test/test_polynomial_mult_1024.cu

Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
/**
2+
* Test for 1024-degree polynomial multiplication
3+
* This test performs polynomial multiplication of 1024-degree polynomials
4+
* with 32-bit coefficient polynomial times up to 20-bit coefficient polynomial mod 32 bit
5+
*/
6+
7+
#include <cassert>
8+
#include <iostream>
9+
#include <random>
10+
#include <vector>
11+
#include <cstdint>
12+
#include <algorithm>
13+
#include <cmath>
14+
15+
#include <include/ntt_gpu/ntt.cuh>
16+
#include <include/details/error_gpu.cuh>
17+
#include <include/cufhe_gpu.cuh>
18+
19+
using namespace std;
20+
using namespace cufhe;
21+
22+
// Test parameters for 1024-degree polynomial
23+
constexpr uint32_t POLY_SIZE = 1024;
24+
constexpr uint32_t NUM_TESTS = 100;
25+
26+
// Coefficient bounds
27+
constexpr uint32_t COEFF_32BIT_MAX = 0xFFFFFFFF; // 32-bit max
28+
constexpr uint32_t COEFF_20BIT_MAX = 0xFFFFF; // 20-bit max
29+
30+
// Helper function to generate random polynomial with bounded coefficients
31+
void generateRandomPolynomial(vector<uint32_t>& poly, uint32_t max_value, default_random_engine& engine) {
32+
uniform_int_distribution<uint32_t> dist(0, max_value);
33+
for (auto& coeff : poly) {
34+
coeff = dist(engine);
35+
}
36+
}
37+
38+
// CPU reference implementation of negacyclic polynomial multiplication
39+
// Result: c = a * b mod (X^n + 1) mod 2^32
40+
void polynomialMultiplicationCPU(const vector<uint32_t>& a,
41+
const vector<uint32_t>& b,
42+
vector<uint32_t>& result) {
43+
// Initialize result with zeros
44+
fill(result.begin(), result.end(), 0);
45+
46+
// Naive polynomial multiplication with negacyclic reduction
47+
for (uint32_t i = 0; i < POLY_SIZE; i++) {
48+
for (uint32_t j = 0; j < POLY_SIZE; j++) {
49+
uint64_t prod = static_cast<uint64_t>(a[i]) * static_cast<uint64_t>(b[j]);
50+
51+
if (i + j < POLY_SIZE) {
52+
// Normal case: add to result[i+j]
53+
result[i + j] = (result[i + j] + static_cast<uint32_t>(prod)) & COEFF_32BIT_MAX;
54+
} else {
55+
// Wraparound with negation (negacyclic)
56+
uint32_t idx = (i + j) - POLY_SIZE;
57+
// Subtract instead of add due to X^n = -1
58+
result[idx] = (result[idx] - static_cast<uint32_t>(prod)) & COEFF_32BIT_MAX;
59+
}
60+
}
61+
}
62+
}
63+
64+
// GPU kernel for forward NTT transformation
65+
__global__ void ForwardNTT(FFP* d_out_ntt,
66+
const uint32_t* d_in,
67+
CuNTTHandler<1024> ntt_handler) {
68+
__shared__ FFP sh_temp[POLY_SIZE];
69+
ntt_handler.NTT<uint32_t>(d_out_ntt, d_in, sh_temp, 0);
70+
}
71+
72+
// GPU kernel for pointwise multiplication in NTT domain
73+
__global__ void PointwiseMultiply(FFP* d_result_ntt,
74+
const FFP* d_a_ntt,
75+
const FFP* d_b_ntt) {
76+
const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
77+
if (tid < POLY_SIZE) {
78+
d_result_ntt[tid] = d_a_ntt[tid] * d_b_ntt[tid];
79+
}
80+
}
81+
82+
// GPU kernel for inverse NTT transformation
83+
__global__ void InverseNTT(uint32_t* d_out,
84+
const FFP* d_in_ntt,
85+
CuNTTHandler<1024> ntt_handler) {
86+
__shared__ FFP sh_temp[POLY_SIZE];
87+
ntt_handler.NTTInv<uint32_t>(d_out, d_in_ntt, sh_temp, 0);
88+
}
89+
90+
int main() {
91+
cout << "=== Test for 1024-degree Polynomial Multiplication ===" << endl;
92+
cout << "Polynomial degree: " << POLY_SIZE << endl;
93+
cout << "Number of tests: " << NUM_TESTS << endl;
94+
cout << "First polynomial coefficients: up to 32-bit" << endl;
95+
cout << "Second polynomial coefficients: up to 20-bit" << endl;
96+
cout << endl;
97+
98+
// Initialize random number generator
99+
random_device seed_gen;
100+
default_random_engine engine(seed_gen());
101+
102+
// Initialize NTT handler
103+
CuNTTHandler<1024>* ntt_handler = new CuNTTHandler<1024>();
104+
ntt_handler->Create();
105+
ntt_handler->CreateConstant();
106+
cudaDeviceSynchronize();
107+
CuCheckError();
108+
109+
// Test variables
110+
vector<uint32_t> poly_a(POLY_SIZE);
111+
vector<uint32_t> poly_b(POLY_SIZE);
112+
vector<uint32_t> cpu_result(POLY_SIZE);
113+
vector<uint32_t> gpu_result(POLY_SIZE);
114+
115+
// Device memory
116+
uint32_t* d_poly_a;
117+
uint32_t* d_poly_b;
118+
uint32_t* d_result;
119+
FFP* d_a_ntt;
120+
FFP* d_b_ntt;
121+
FFP* d_result_ntt;
122+
123+
cudaMalloc((void**)&d_poly_a, POLY_SIZE * sizeof(uint32_t));
124+
cudaMalloc((void**)&d_poly_b, POLY_SIZE * sizeof(uint32_t));
125+
cudaMalloc((void**)&d_result, POLY_SIZE * sizeof(uint32_t));
126+
cudaMalloc((void**)&d_a_ntt, POLY_SIZE * sizeof(FFP));
127+
cudaMalloc((void**)&d_b_ntt, POLY_SIZE * sizeof(FFP));
128+
cudaMalloc((void**)&d_result_ntt, POLY_SIZE * sizeof(FFP));
129+
CuCheckError();
130+
131+
// Run tests
132+
int passed_tests = 0;
133+
int failed_tests = 0;
134+
135+
// Configure kernel launch parameters
136+
// NTT kernels use 128 threads (1024 >> NTT_THREAD_UNITBIT where NTT_THREAD_UNITBIT = 3)
137+
dim3 ntt_block(POLY_SIZE >> NTT_THREAD_UNITBIT); // 128 threads
138+
dim3 ntt_grid(1);
139+
140+
// Pointwise multiplication can use more threads
141+
dim3 mult_block(256);
142+
dim3 mult_grid((POLY_SIZE + mult_block.x - 1) / mult_block.x);
143+
144+
for (uint32_t test = 0; test < NUM_TESTS; test++) {
145+
cout << "Test " << (test + 1) << "/" << NUM_TESTS << ": ";
146+
147+
// Generate random polynomials with appropriate bounds
148+
// First polynomial with up to 32-bit coefficients
149+
generateRandomPolynomial(poly_a, COEFF_32BIT_MAX, engine);
150+
// Second polynomial with up to 20-bit coefficients
151+
generateRandomPolynomial(poly_b, COEFF_20BIT_MAX, engine);
152+
153+
// For initial testing, use smaller values to verify correctness
154+
if (test < 10) {
155+
generateRandomPolynomial(poly_a, 0xFFFF, engine); // 16-bit for initial tests
156+
generateRandomPolynomial(poly_b, 0xFF, engine); // 8-bit for initial tests
157+
}
158+
159+
// CPU computation (reference)
160+
polynomialMultiplicationCPU(poly_a, poly_b, cpu_result);
161+
162+
// Copy data to device
163+
cudaMemcpy(d_poly_a, poly_a.data(), POLY_SIZE * sizeof(uint32_t), cudaMemcpyHostToDevice);
164+
cudaMemcpy(d_poly_b, poly_b.data(), POLY_SIZE * sizeof(uint32_t), cudaMemcpyHostToDevice);
165+
CuCheckError();
166+
167+
// GPU computation using NTT
168+
// Step 1: Forward NTT on polynomial A
169+
ForwardNTT<<<ntt_grid, ntt_block>>>(d_a_ntt, d_poly_a, *ntt_handler);
170+
cudaDeviceSynchronize();
171+
CuCheckError();
172+
173+
// Step 2: Forward NTT on polynomial B
174+
ForwardNTT<<<ntt_grid, ntt_block>>>(d_b_ntt, d_poly_b, *ntt_handler);
175+
cudaDeviceSynchronize();
176+
CuCheckError();
177+
178+
// Step 3: Pointwise multiplication in NTT domain
179+
PointwiseMultiply<<<mult_grid, mult_block>>>(d_result_ntt, d_a_ntt, d_b_ntt);
180+
cudaDeviceSynchronize();
181+
CuCheckError();
182+
183+
// Step 4: Inverse NTT to get the result
184+
InverseNTT<<<ntt_grid, ntt_block>>>(d_result, d_result_ntt, *ntt_handler);
185+
cudaDeviceSynchronize();
186+
CuCheckError();
187+
188+
// Copy result back to host
189+
cudaMemcpy(gpu_result.data(), d_result, POLY_SIZE * sizeof(uint32_t), cudaMemcpyDeviceToHost);
190+
CuCheckError();
191+
192+
// Compare results
193+
bool test_passed = true;
194+
uint32_t max_diff = 0;
195+
uint32_t num_mismatches = 0;
196+
197+
for (uint32_t i = 0; i < POLY_SIZE; i++) {
198+
uint32_t diff = abs(static_cast<int64_t>(cpu_result[i]) - static_cast<int64_t>(gpu_result[i]));
199+
max_diff = max(max_diff, diff);
200+
201+
// Allow small numerical errors due to modular arithmetic
202+
if (diff > 2) {
203+
test_passed = false;
204+
num_mismatches++;
205+
if (num_mismatches <= 5 && failed_tests < 3) { // Show details for first few failures
206+
cout << "\n Mismatch at index " << i << ": ";
207+
cout << "CPU=" << cpu_result[i] << ", GPU=" << gpu_result[i];
208+
cout << ", diff=" << diff;
209+
}
210+
}
211+
}
212+
213+
if (test_passed) {
214+
cout << "PASSED (max_diff=" << max_diff << ")" << endl;
215+
passed_tests++;
216+
} else {
217+
cout << "FAILED (mismatches=" << num_mismatches << ", max_diff=" << max_diff << ")" << endl;
218+
failed_tests++;
219+
220+
// Debug: Check if NTT results are non-zero
221+
if (failed_tests == 1) {
222+
vector<FFP> h_a_ntt(POLY_SIZE);
223+
cudaMemcpy(h_a_ntt.data(), d_a_ntt, POLY_SIZE * sizeof(FFP), cudaMemcpyDeviceToHost);
224+
bool all_zero = true;
225+
for (int i = 0; i < 10; i++) {
226+
if (h_a_ntt[i].val() != 0) {
227+
all_zero = false;
228+
break;
229+
}
230+
}
231+
if (all_zero) {
232+
cout << " WARNING: NTT result appears to be all zeros!" << endl;
233+
} else {
234+
cout << " NTT result has non-zero values (first element: " << h_a_ntt[0].val() << ")" << endl;
235+
}
236+
}
237+
}
238+
}
239+
240+
// Cleanup
241+
cudaFree(d_poly_a);
242+
cudaFree(d_poly_b);
243+
cudaFree(d_result);
244+
cudaFree(d_a_ntt);
245+
cudaFree(d_b_ntt);
246+
cudaFree(d_result_ntt);
247+
248+
ntt_handler->Destroy();
249+
delete ntt_handler;
250+
251+
// Summary
252+
cout << "\n=== Test Summary ===" << endl;
253+
cout << "Passed: " << passed_tests << "/" << NUM_TESTS << endl;
254+
cout << "Failed: " << failed_tests << "/" << NUM_TESTS << endl;
255+
256+
if (failed_tests == 0) {
257+
cout << "\n*** ALL TESTS PASSED! ***" << endl;
258+
cout << "1024-degree polynomial multiplication is working correctly." << endl;
259+
cout << "The implementation successfully handles:" << endl;
260+
cout << " - 32-bit coefficient polynomials" << endl;
261+
cout << " - 20-bit coefficient polynomials" << endl;
262+
cout << " - Negacyclic polynomial multiplication mod (X^1024 + 1)" << endl;
263+
} else {
264+
cout << "\n*** SOME TESTS FAILED ***" << endl;
265+
cout << "The NTT-based polynomial multiplication may need debugging." << endl;
266+
return 1;
267+
}
268+
269+
return 0;
270+
}

0 commit comments

Comments
 (0)