1+ /* *
2+ * Test for 1024-degree polynomial multiplication
3+ * This test performs polynomial multiplication of 1024-degree polynomials
4+ * with 32-bit coefficient polynomial times up to 20-bit coefficient polynomial mod 32 bit
5+ */
6+
7+ #include < cassert>
8+ #include < iostream>
9+ #include < random>
10+ #include < vector>
11+ #include < cstdint>
12+ #include < algorithm>
13+ #include < cmath>
14+
15+ #include < include/ntt_gpu/ntt.cuh>
16+ #include < include/details/error_gpu.cuh>
17+ #include < include/cufhe_gpu.cuh>
18+
19+ using namespace std ;
20+ using namespace cufhe ;
21+
22+ // Test parameters for 1024-degree polynomial
23+ constexpr uint32_t POLY_SIZE = 1024 ;
24+ constexpr uint32_t NUM_TESTS = 100 ;
25+
26+ // Coefficient bounds
27+ constexpr uint32_t COEFF_32BIT_MAX = 0xFFFFFFFF ; // 32-bit max
28+ constexpr uint32_t COEFF_20BIT_MAX = 0xFFFFF ; // 20-bit max
29+
30+ // Helper function to generate random polynomial with bounded coefficients
31+ void generateRandomPolynomial (vector<uint32_t >& poly, uint32_t max_value, default_random_engine& engine) {
32+ uniform_int_distribution<uint32_t > dist (0 , max_value);
33+ for (auto & coeff : poly) {
34+ coeff = dist (engine);
35+ }
36+ }
37+
38+ // CPU reference implementation of negacyclic polynomial multiplication
39+ // Result: c = a * b mod (X^n + 1) mod 2^32
40+ void polynomialMultiplicationCPU (const vector<uint32_t >& a,
41+ const vector<uint32_t >& b,
42+ vector<uint32_t >& result) {
43+ // Initialize result with zeros
44+ fill (result.begin (), result.end (), 0 );
45+
46+ // Naive polynomial multiplication with negacyclic reduction
47+ for (uint32_t i = 0 ; i < POLY_SIZE; i++) {
48+ for (uint32_t j = 0 ; j < POLY_SIZE; j++) {
49+ uint64_t prod = static_cast <uint64_t >(a[i]) * static_cast <uint64_t >(b[j]);
50+
51+ if (i + j < POLY_SIZE) {
52+ // Normal case: add to result[i+j]
53+ result[i + j] = (result[i + j] + static_cast <uint32_t >(prod)) & COEFF_32BIT_MAX;
54+ } else {
55+ // Wraparound with negation (negacyclic)
56+ uint32_t idx = (i + j) - POLY_SIZE;
57+ // Subtract instead of add due to X^n = -1
58+ result[idx] = (result[idx] - static_cast <uint32_t >(prod)) & COEFF_32BIT_MAX;
59+ }
60+ }
61+ }
62+ }
63+
64+ // GPU kernel for forward NTT transformation
65+ __global__ void ForwardNTT (FFP* d_out_ntt,
66+ const uint32_t * d_in,
67+ CuNTTHandler<1024 > ntt_handler) {
68+ __shared__ FFP sh_temp[POLY_SIZE];
69+ ntt_handler.NTT <uint32_t >(d_out_ntt, d_in, sh_temp, 0 );
70+ }
71+
72+ // GPU kernel for pointwise multiplication in NTT domain
73+ __global__ void PointwiseMultiply (FFP* d_result_ntt,
74+ const FFP* d_a_ntt,
75+ const FFP* d_b_ntt) {
76+ const uint32_t tid = blockIdx .x * blockDim .x + threadIdx .x ;
77+ if (tid < POLY_SIZE) {
78+ d_result_ntt[tid] = d_a_ntt[tid] * d_b_ntt[tid];
79+ }
80+ }
81+
82+ // GPU kernel for inverse NTT transformation
83+ __global__ void InverseNTT (uint32_t * d_out,
84+ const FFP* d_in_ntt,
85+ CuNTTHandler<1024 > ntt_handler) {
86+ __shared__ FFP sh_temp[POLY_SIZE];
87+ ntt_handler.NTTInv <uint32_t >(d_out, d_in_ntt, sh_temp, 0 );
88+ }
89+
90+ int main () {
91+ cout << " === Test for 1024-degree Polynomial Multiplication ===" << endl;
92+ cout << " Polynomial degree: " << POLY_SIZE << endl;
93+ cout << " Number of tests: " << NUM_TESTS << endl;
94+ cout << " First polynomial coefficients: up to 32-bit" << endl;
95+ cout << " Second polynomial coefficients: up to 20-bit" << endl;
96+ cout << endl;
97+
98+ // Initialize random number generator
99+ random_device seed_gen;
100+ default_random_engine engine (seed_gen ());
101+
102+ // Initialize NTT handler
103+ CuNTTHandler<1024 >* ntt_handler = new CuNTTHandler<1024 >();
104+ ntt_handler->Create ();
105+ ntt_handler->CreateConstant ();
106+ cudaDeviceSynchronize ();
107+ CuCheckError ();
108+
109+ // Test variables
110+ vector<uint32_t > poly_a (POLY_SIZE);
111+ vector<uint32_t > poly_b (POLY_SIZE);
112+ vector<uint32_t > cpu_result (POLY_SIZE);
113+ vector<uint32_t > gpu_result (POLY_SIZE);
114+
115+ // Device memory
116+ uint32_t * d_poly_a;
117+ uint32_t * d_poly_b;
118+ uint32_t * d_result;
119+ FFP* d_a_ntt;
120+ FFP* d_b_ntt;
121+ FFP* d_result_ntt;
122+
123+ cudaMalloc ((void **)&d_poly_a, POLY_SIZE * sizeof (uint32_t ));
124+ cudaMalloc ((void **)&d_poly_b, POLY_SIZE * sizeof (uint32_t ));
125+ cudaMalloc ((void **)&d_result, POLY_SIZE * sizeof (uint32_t ));
126+ cudaMalloc ((void **)&d_a_ntt, POLY_SIZE * sizeof (FFP));
127+ cudaMalloc ((void **)&d_b_ntt, POLY_SIZE * sizeof (FFP));
128+ cudaMalloc ((void **)&d_result_ntt, POLY_SIZE * sizeof (FFP));
129+ CuCheckError ();
130+
131+ // Run tests
132+ int passed_tests = 0 ;
133+ int failed_tests = 0 ;
134+
135+ // Configure kernel launch parameters
136+ // NTT kernels use 128 threads (1024 >> NTT_THREAD_UNITBIT where NTT_THREAD_UNITBIT = 3)
137+ dim3 ntt_block (POLY_SIZE >> NTT_THREAD_UNITBIT); // 128 threads
138+ dim3 ntt_grid (1 );
139+
140+ // Pointwise multiplication can use more threads
141+ dim3 mult_block (256 );
142+ dim3 mult_grid ((POLY_SIZE + mult_block.x - 1 ) / mult_block.x );
143+
144+ for (uint32_t test = 0 ; test < NUM_TESTS; test++) {
145+ cout << " Test " << (test + 1 ) << " /" << NUM_TESTS << " : " ;
146+
147+ // Generate random polynomials with appropriate bounds
148+ // First polynomial with up to 32-bit coefficients
149+ generateRandomPolynomial (poly_a, COEFF_32BIT_MAX, engine);
150+ // Second polynomial with up to 20-bit coefficients
151+ generateRandomPolynomial (poly_b, COEFF_20BIT_MAX, engine);
152+
153+ // For initial testing, use smaller values to verify correctness
154+ if (test < 10 ) {
155+ generateRandomPolynomial (poly_a, 0xFFFF , engine); // 16-bit for initial tests
156+ generateRandomPolynomial (poly_b, 0xFF , engine); // 8-bit for initial tests
157+ }
158+
159+ // CPU computation (reference)
160+ polynomialMultiplicationCPU (poly_a, poly_b, cpu_result);
161+
162+ // Copy data to device
163+ cudaMemcpy (d_poly_a, poly_a.data (), POLY_SIZE * sizeof (uint32_t ), cudaMemcpyHostToDevice);
164+ cudaMemcpy (d_poly_b, poly_b.data (), POLY_SIZE * sizeof (uint32_t ), cudaMemcpyHostToDevice);
165+ CuCheckError ();
166+
167+ // GPU computation using NTT
168+ // Step 1: Forward NTT on polynomial A
169+ ForwardNTT<<<ntt_grid, ntt_block>>> (d_a_ntt, d_poly_a, *ntt_handler);
170+ cudaDeviceSynchronize ();
171+ CuCheckError ();
172+
173+ // Step 2: Forward NTT on polynomial B
174+ ForwardNTT<<<ntt_grid, ntt_block>>> (d_b_ntt, d_poly_b, *ntt_handler);
175+ cudaDeviceSynchronize ();
176+ CuCheckError ();
177+
178+ // Step 3: Pointwise multiplication in NTT domain
179+ PointwiseMultiply<<<mult_grid, mult_block>>> (d_result_ntt, d_a_ntt, d_b_ntt);
180+ cudaDeviceSynchronize ();
181+ CuCheckError ();
182+
183+ // Step 4: Inverse NTT to get the result
184+ InverseNTT<<<ntt_grid, ntt_block>>> (d_result, d_result_ntt, *ntt_handler);
185+ cudaDeviceSynchronize ();
186+ CuCheckError ();
187+
188+ // Copy result back to host
189+ cudaMemcpy (gpu_result.data (), d_result, POLY_SIZE * sizeof (uint32_t ), cudaMemcpyDeviceToHost);
190+ CuCheckError ();
191+
192+ // Compare results
193+ bool test_passed = true ;
194+ uint32_t max_diff = 0 ;
195+ uint32_t num_mismatches = 0 ;
196+
197+ for (uint32_t i = 0 ; i < POLY_SIZE; i++) {
198+ uint32_t diff = abs (static_cast <int64_t >(cpu_result[i]) - static_cast <int64_t >(gpu_result[i]));
199+ max_diff = max (max_diff, diff);
200+
201+ // Allow small numerical errors due to modular arithmetic
202+ if (diff > 2 ) {
203+ test_passed = false ;
204+ num_mismatches++;
205+ if (num_mismatches <= 5 && failed_tests < 3 ) { // Show details for first few failures
206+ cout << " \n Mismatch at index " << i << " : " ;
207+ cout << " CPU=" << cpu_result[i] << " , GPU=" << gpu_result[i];
208+ cout << " , diff=" << diff;
209+ }
210+ }
211+ }
212+
213+ if (test_passed) {
214+ cout << " PASSED (max_diff=" << max_diff << " )" << endl;
215+ passed_tests++;
216+ } else {
217+ cout << " FAILED (mismatches=" << num_mismatches << " , max_diff=" << max_diff << " )" << endl;
218+ failed_tests++;
219+
220+ // Debug: Check if NTT results are non-zero
221+ if (failed_tests == 1 ) {
222+ vector<FFP> h_a_ntt (POLY_SIZE);
223+ cudaMemcpy (h_a_ntt.data (), d_a_ntt, POLY_SIZE * sizeof (FFP), cudaMemcpyDeviceToHost);
224+ bool all_zero = true ;
225+ for (int i = 0 ; i < 10 ; i++) {
226+ if (h_a_ntt[i].val () != 0 ) {
227+ all_zero = false ;
228+ break ;
229+ }
230+ }
231+ if (all_zero) {
232+ cout << " WARNING: NTT result appears to be all zeros!" << endl;
233+ } else {
234+ cout << " NTT result has non-zero values (first element: " << h_a_ntt[0 ].val () << " )" << endl;
235+ }
236+ }
237+ }
238+ }
239+
240+ // Cleanup
241+ cudaFree (d_poly_a);
242+ cudaFree (d_poly_b);
243+ cudaFree (d_result);
244+ cudaFree (d_a_ntt);
245+ cudaFree (d_b_ntt);
246+ cudaFree (d_result_ntt);
247+
248+ ntt_handler->Destroy ();
249+ delete ntt_handler;
250+
251+ // Summary
252+ cout << " \n === Test Summary ===" << endl;
253+ cout << " Passed: " << passed_tests << " /" << NUM_TESTS << endl;
254+ cout << " Failed: " << failed_tests << " /" << NUM_TESTS << endl;
255+
256+ if (failed_tests == 0 ) {
257+ cout << " \n *** ALL TESTS PASSED! ***" << endl;
258+ cout << " 1024-degree polynomial multiplication is working correctly." << endl;
259+ cout << " The implementation successfully handles:" << endl;
260+ cout << " - 32-bit coefficient polynomials" << endl;
261+ cout << " - 20-bit coefficient polynomials" << endl;
262+ cout << " - Negacyclic polynomial multiplication mod (X^1024 + 1)" << endl;
263+ } else {
264+ cout << " \n *** SOME TESTS FAILED ***" << endl;
265+ cout << " The NTT-based polynomial multiplication may need debugging." << endl;
266+ return 1 ;
267+ }
268+
269+ return 0 ;
270+ }
0 commit comments