LAMMP 4.1.0
Lamina High-Precision Arithmetic Library
载入中...
搜索中...
未找到
mul_basecase.c
浏览该文件的文档.
1/*
2 * LAMMP - Copyright (C) 2025-2026 HJimmyK(Jericho Knox)
3 * This file is part of lammp, under the GNU LGPL v2 license.
4 * See LICENSE in the project root for the full license text.
5 */
6
7#include "../../../include/lammp/lmmpn.h"
8#include "../../../include/lammp/impl/longlong.h"
9
10void lmmp_sqr_basecase_(mp_ptr restrict dst, mp_srcptr restrict numa, mp_size_t na) {
11 lmmp_param_assert(na >= 1);
12
13 mp_size_t i;
14 mp_limb_t cl, x;
15
16 x = numa[0];
17 cl = 0;
18
19 for (i = 0; i + 4 <= na; i += 4) {
20 mp_limb_t u0 = numa[i + 0], u1 = numa[i + 1], u2 = numa[i + 2], u3 = numa[i + 3];
21 mp_limb_t l0, h0, l1, h1, l2, h2, l3, h3;
22
23 _umul64to128_(u0, x, &l0, &h0);
24 l0 += cl;
25 cl = (l0 < cl) + h0;
26 _umul64to128_(u1, x, &l1, &h1);
27 l1 += cl;
28 cl = (l1 < cl) + h1;
29 _umul64to128_(u2, x, &l2, &h2);
30 l2 += cl;
31 cl = (l2 < cl) + h2;
32 _umul64to128_(u3, x, &l3, &h3);
33 l3 += cl;
34 cl = (l3 < cl) + h3;
35
36 dst[i + 0] = l0;
37 dst[i + 1] = l1;
38 dst[i + 2] = l2;
39 dst[i + 3] = l3;
40 }
41 for (; i < na; i++) {
42 mp_limb_t u = numa[i], l, h;
43 _umul64to128_(u, x, &l, &h);
44 l += cl;
45 cl = (l < cl) + h;
46 dst[i] = l;
47 }
48 dst[na] = cl;
49
50 dst++;
51 mp_size_t nb = na - 1;
52 mp_limb_t* nb_ptr = (mp_limb_t*)numa + 1;
53
54 while (nb >= 2) {
55 cl = 0;
56 x = nb_ptr[0];
57 for (i = 0; i + 4 <= na; i += 4) {
58 mp_limb_t u0 = numa[i + 0], d0 = dst[i + 0];
59 mp_limb_t u1 = numa[i + 1], d1 = dst[i + 1];
60 mp_limb_t u2 = numa[i + 2], d2 = dst[i + 2];
61 mp_limb_t u3 = numa[i + 3], d3 = dst[i + 3];
62 mp_limb_t l0, h0, l1, h1, l2, h2, l3, h3;
63
64 _umul64to128_(u0, x, &l0, &h0);
65 l0 += cl;
66 cl = (l0 < cl) + h0;
67 l0 += d0;
68 cl += (l0 < d0);
69 dst[i + 0] = l0;
70 _umul64to128_(u1, x, &l1, &h1);
71 l1 += cl;
72 cl = (l1 < cl) + h1;
73 l1 += d1;
74 cl += (l1 < d1);
75 dst[i + 1] = l1;
76 _umul64to128_(u2, x, &l2, &h2);
77 l2 += cl;
78 cl = (l2 < cl) + h2;
79 l2 += d2;
80 cl += (l2 < d2);
81 dst[i + 2] = l2;
82 _umul64to128_(u3, x, &l3, &h3);
83 l3 += cl;
84 cl = (l3 < cl) + h3;
85 l3 += d3;
86 cl += (l3 < d3);
87 dst[i + 3] = l3;
88 }
89 for (; i < na; i++) {
90 mp_limb_t u = numa[i], d = dst[i], l, h;
91 _umul64to128_(u, x, &l, &h);
92 l += cl;
93 cl = (l < cl) + h;
94 l += d;
95 cl += (l < d);
96 dst[i] = l;
97 }
98 dst[na] = cl;
99 dst++;
100
101 cl = 0;
102 x = nb_ptr[1];
103 for (i = 0; i + 4 <= na; i += 4) {
104 mp_limb_t u0 = numa[i + 0], d0 = dst[i + 0];
105 mp_limb_t u1 = numa[i + 1], d1 = dst[i + 1];
106 mp_limb_t u2 = numa[i + 2], d2 = dst[i + 2];
107 mp_limb_t u3 = numa[i + 3], d3 = dst[i + 3];
108 mp_limb_t l0, h0, l1, h1, l2, h2, l3, h3;
109
110 _umul64to128_(u0, x, &l0, &h0);
111 l0 += cl;
112 cl = (l0 < cl) + h0;
113 l0 += d0;
114 cl += (l0 < d0);
115 dst[i + 0] = l0;
116 _umul64to128_(u1, x, &l1, &h1);
117 l1 += cl;
118 cl = (l1 < cl) + h1;
119 l1 += d1;
120 cl += (l1 < d1);
121 dst[i + 1] = l1;
122 _umul64to128_(u2, x, &l2, &h2);
123 l2 += cl;
124 cl = (l2 < cl) + h2;
125 l2 += d2;
126 cl += (l2 < d2);
127 dst[i + 2] = l2;
128 _umul64to128_(u3, x, &l3, &h3);
129 l3 += cl;
130 cl = (l3 < cl) + h3;
131 l3 += d3;
132 cl += (l3 < d3);
133 dst[i + 3] = l3;
134 }
135 for (; i < na; i++) {
136 mp_limb_t u = numa[i], d = dst[i], l, h;
137 _umul64to128_(u, x, &l, &h);
138 l += cl;
139 cl = (l < cl) + h;
140 l += d;
141 cl += (l < d);
142 dst[i] = l;
143 }
144 dst[na] = cl;
145 dst++;
146
147 nb_ptr += 2;
148 nb -= 2;
149 }
150
151 while (nb >= 1) {
152 cl = 0;
153 x = nb_ptr[0];
154 for (i = 0; i + 4 <= na; i += 4) {
155 mp_limb_t u0 = numa[i + 0], d0 = dst[i + 0];
156 mp_limb_t u1 = numa[i + 1], d1 = dst[i + 1];
157 mp_limb_t u2 = numa[i + 2], d2 = dst[i + 2];
158 mp_limb_t u3 = numa[i + 3], d3 = dst[i + 3];
159 mp_limb_t l0, h0, l1, h1, l2, h2, l3, h3;
160
161 _umul64to128_(u0, x, &l0, &h0);
162 l0 += cl;
163 cl = (l0 < cl) + h0;
164 l0 += d0;
165 cl += (l0 < d0);
166 dst[i + 0] = l0;
167 _umul64to128_(u1, x, &l1, &h1);
168 l1 += cl;
169 cl = (l1 < cl) + h1;
170 l1 += d1;
171 cl += (l1 < d1);
172 dst[i + 1] = l1;
173 _umul64to128_(u2, x, &l2, &h2);
174 l2 += cl;
175 cl = (l2 < cl) + h2;
176 l2 += d2;
177 cl += (l2 < d2);
178 dst[i + 2] = l2;
179 _umul64to128_(u3, x, &l3, &h3);
180 l3 += cl;
181 cl = (l3 < cl) + h3;
182 l3 += d3;
183 cl += (l3 < d3);
184 dst[i + 3] = l3;
185 }
186 for (; i < na; i++) {
187 mp_limb_t u = numa[i], d = dst[i], l, h;
188 _umul64to128_(u, x, &l, &h);
189 l += cl;
190 cl = (l < cl) + h;
191 l += d;
192 cl += (l < d);
193 dst[i] = l;
194 }
195 dst[na] = cl;
196 dst++;
197 nb_ptr++;
198 nb--;
199 }
200}
202 mp_ptr restrict dst,
203 mp_srcptr restrict numa,
204 mp_size_t na,
205 mp_srcptr restrict numb,
206 mp_size_t nb
207) {
208 lmmp_param_assert(na >= nb);
209 lmmp_param_assert(nb >= 1);
210
211 mp_size_t i;
212 mp_limb_t cl;
213
214 cl = 0;
215 mp_limb_t x = numb[0];
216 for (i = 0; i + 4 <= na; i += 4) {
217 mp_limb_t u0 = numa[i + 0], u1 = numa[i + 1], u2 = numa[i + 2], u3 = numa[i + 3];
218 mp_limb_t l0, h0, l1, h1, l2, h2, l3, h3;
219
220 _umul64to128_(u0, x, &l0, &h0);
221 l0 += cl;
222 cl = (l0 < cl) + h0;
223 _umul64to128_(u1, x, &l1, &h1);
224 l1 += cl;
225 cl = (l1 < cl) + h1;
226 _umul64to128_(u2, x, &l2, &h2);
227 l2 += cl;
228 cl = (l2 < cl) + h2;
229 _umul64to128_(u3, x, &l3, &h3);
230 l3 += cl;
231 cl = (l3 < cl) + h3;
232
233 dst[i + 0] = l0;
234 dst[i + 1] = l1;
235 dst[i + 2] = l2;
236 dst[i + 3] = l3;
237 }
238 for (; i < na; i++) {
239 mp_limb_t u = numa[i], l, h;
240 _umul64to128_(u, x, &l, &h);
241 l += cl;
242 cl = (l < cl) + h;
243 dst[i] = l;
244 }
245 dst[na] = cl;
246 dst++;
247 numb++;
248 nb--;
249
250 while (nb >= 2) {
251 // 第一个乘数
252 cl = 0;
253 x = numb[0];
254 for (i = 0; i + 4 <= na; i += 4) {
255 mp_limb_t u0 = numa[i + 0], d0 = dst[i + 0];
256 mp_limb_t u1 = numa[i + 1], d1 = dst[i + 1];
257 mp_limb_t u2 = numa[i + 2], d2 = dst[i + 2];
258 mp_limb_t u3 = numa[i + 3], d3 = dst[i + 3];
259 mp_limb_t l0, h0, l1, h1, l2, h2, l3, h3;
260
261 _umul64to128_(u0, x, &l0, &h0);
262 l0 += cl;
263 cl = (l0 < cl) + h0;
264 l0 += d0;
265 cl += (l0 < d0);
266 dst[i + 0] = l0;
267 _umul64to128_(u1, x, &l1, &h1);
268 l1 += cl;
269 cl = (l1 < cl) + h1;
270 l1 += d1;
271 cl += (l1 < d1);
272 dst[i + 1] = l1;
273 _umul64to128_(u2, x, &l2, &h2);
274 l2 += cl;
275 cl = (l2 < cl) + h2;
276 l2 += d2;
277 cl += (l2 < d2);
278 dst[i + 2] = l2;
279 _umul64to128_(u3, x, &l3, &h3);
280 l3 += cl;
281 cl = (l3 < cl) + h3;
282 l3 += d3;
283 cl += (l3 < d3);
284 dst[i + 3] = l3;
285 }
286 for (; i < na; i++) {
287 mp_limb_t u = numa[i], d = dst[i], l, h;
288 _umul64to128_(u, x, &l, &h);
289 l += cl;
290 cl = (l < cl) + h;
291 l += d;
292 cl += (l < d);
293 dst[i] = l;
294 }
295 dst[na] = cl;
296
297 dst++;
298 cl = 0;
299 x = numb[1];
300 for (i = 0; i + 4 <= na; i += 4) {
301 mp_limb_t u0 = numa[i + 0], d0 = dst[i + 0];
302 mp_limb_t u1 = numa[i + 1], d1 = dst[i + 1];
303 mp_limb_t u2 = numa[i + 2], d2 = dst[i + 2];
304 mp_limb_t u3 = numa[i + 3], d3 = dst[i + 3];
305 mp_limb_t l0, h0, l1, h1, l2, h2, l3, h3;
306
307 _umul64to128_(u0, x, &l0, &h0);
308 l0 += cl;
309 cl = (l0 < cl) + h0;
310 l0 += d0;
311 cl += (l0 < d0);
312 dst[i + 0] = l0;
313 _umul64to128_(u1, x, &l1, &h1);
314 l1 += cl;
315 cl = (l1 < cl) + h1;
316 l1 += d1;
317 cl += (l1 < d1);
318 dst[i + 1] = l1;
319 _umul64to128_(u2, x, &l2, &h2);
320 l2 += cl;
321 cl = (l2 < cl) + h2;
322 l2 += d2;
323 cl += (l2 < d2);
324 dst[i + 2] = l2;
325 _umul64to128_(u3, x, &l3, &h3);
326 l3 += cl;
327 cl = (l3 < cl) + h3;
328 l3 += d3;
329 cl += (l3 < d3);
330 dst[i + 3] = l3;
331 }
332 for (; i < na; i++) {
333 mp_limb_t u = numa[i], d = dst[i], l, h;
334 _umul64to128_(u, x, &l, &h);
335 l += cl;
336 cl = (l < cl) + h;
337 l += d;
338 cl += (l < d);
339 dst[i] = l;
340 }
341 dst[na] = cl;
342
343 dst++;
344 numb += 2;
345 nb -= 2;
346 }
347
348 while (nb >= 1) {
349 cl = 0;
350 x = numb[0];
351 for (i = 0; i + 4 <= na; i += 4) {
352 mp_limb_t u0 = numa[i + 0], d0 = dst[i + 0];
353 mp_limb_t u1 = numa[i + 1], d1 = dst[i + 1];
354 mp_limb_t u2 = numa[i + 2], d2 = dst[i + 2];
355 mp_limb_t u3 = numa[i + 3], d3 = dst[i + 3];
356 mp_limb_t l0, h0, l1, h1, l2, h2, l3, h3;
357
358 _umul64to128_(u0, x, &l0, &h0);
359 l0 += cl;
360 cl = (l0 < cl) + h0;
361 l0 += d0;
362 cl += (l0 < d0);
363 dst[i + 0] = l0;
364 _umul64to128_(u1, x, &l1, &h1);
365 l1 += cl;
366 cl = (l1 < cl) + h1;
367 l1 += d1;
368 cl += (l1 < d1);
369 dst[i + 1] = l1;
370 _umul64to128_(u2, x, &l2, &h2);
371 l2 += cl;
372 cl = (l2 < cl) + h2;
373 l2 += d2;
374 cl += (l2 < d2);
375 dst[i + 2] = l2;
376 _umul64to128_(u3, x, &l3, &h3);
377 l3 += cl;
378 cl = (l3 < cl) + h3;
379 l3 += d3;
380 cl += (l3 < d3);
381 dst[i + 3] = l3;
382 }
383 for (; i < na; i++) {
384 mp_limb_t u = numa[i], d = dst[i], l, h;
385 _umul64to128_(u, x, &l, &h);
386 l += cl;
387 cl = (l < cl) + h;
388 l += d;
389 cl += (l < d);
390 dst[i] = l;
391 }
392 dst[na] = cl;
393 dst++;
394 numb++;
395 nb--;
396 }
397}
mp_limb_t * mp_ptr
Definition lmmp.h:215
uint64_t mp_size_t
Definition lmmp.h:212
const mp_limb_t * mp_srcptr
Definition lmmp.h:216
uint64_t mp_limb_t
Definition lmmp.h:211
#define lmmp_param_assert(x)
Definition lmmp.h:398
static void _umul64to128_(uint64_t a, uint64_t b, uint64_t *low, uint64_t *high)
Definition longlong.h:31
void lmmp_mul_basecase_(mp_ptr restrict dst, mp_srcptr restrict numa, mp_size_t na, mp_srcptr restrict numb, mp_size_t nb)
void lmmp_sqr_basecase_(mp_ptr restrict dst, mp_srcptr restrict numa, mp_size_t na)