static uint64_t load_3(const unsigned char *in) {
  uint64_t result = (uint64_t) in[0];
  result |= shlu64(in[1], 8);
  result |= shlu64(in[2], 16);
  return result;
}

static uint64_t load_4(const unsigned char *in) {
  uint64_t result = (uint64_t) in[0];
  result |= shlu64(in[1], 8);
  result |= shlu64(in[2], 16);
  result |= shlu64(in[3], 24);
  return result;
}
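/*
 * Not part of the original file: the shift helpers used throughout this
 * listing (shlu64 above, shl64 and shl32 below) are not defined in this
 * excerpt.  A minimal sketch of what they are assumed to do -- perform the
 * left shift in the unsigned domain so that shifting a negative or promoted
 * value stays well defined -- using illustrative names so as not to collide
 * with the real definitions:
 */
static uint64_t shlu64_sketch(uint64_t x, unsigned int shift) {
  return x << shift;                        /* operand is already unsigned */
}
static int64_t shl64_sketch(int64_t x, unsigned int shift) {
  return (int64_t) ((uint64_t) x << shift); /* shift as uint64_t, convert back */
}
static int32_t shl32_sketch(int32_t x, unsigned int shift) {
  return (int32_t) ((uint32_t) x << shift); /* same idea for the 32-bit carries */
}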
Can overlap h with f or g.
|f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
|g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
|h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
void fe_add(fe h, const fe f, const fe g) {
  int32_t h0 = f0 + g0;
  int32_t h1 = f1 + g1;
  int32_t h2 = f2 + g2;
  int32_t h3 = f3 + g3;
  int32_t h4 = f4 + g4;
  int32_t h5 = f5 + g5;
  int32_t h6 = f6 + g6;
  int32_t h7 = f7 + g7;
  int32_t h8 = f8 + g8;
  int32_t h9 = f9 + g9;
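  /*
   * Note (not part of the original file): the 1.1*2^26 bound on these sums
   * is well inside the 1.65*2^26 input bound quoted for fe_mul and fe_sq
   * below, so a freshly added value can be fed straight into a
   * multiplication, e.g. fe_add(t, x, y); fe_mul(t, t, z); with no
   * intermediate reduction.
   */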
Replace (f,g) with (g,g) if b == 1;
replace (f,g) with (f,g) if b == 0.
Preconditions: b in {0,1}.
void fe_cmov(fe f, const fe g, unsigned int b) {
  int32_t x0 = f0 ^ g0;
  int32_t x1 = f1 ^ g1;
  int32_t x2 = f2 ^ g2;
  int32_t x3 = f3 ^ g3;
  int32_t x4 = f4 ^ g4;
  int32_t x5 = f5 ^ g5;
  int32_t x6 = f6 ^ g6;
  int32_t x7 = f7 ^ g7;
  int32_t x8 = f8 ^ g8;
  int32_t x9 = f9 ^ g9;
  b = (unsigned int)(- (int) b); /* silence warning */
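  /*
   * The remaining lines of fe_cmov are omitted from this excerpt.  A sketch
   * of how such a constant-time move finishes (illustrative, not the
   * original code): b is now either 0 or an all-ones mask, so ANDing it into
   * each XOR difference and folding that back into f selects g when b == 1
   * and leaves f unchanged when b == 0, with no data-dependent branch.
   */
  x0 &= b; x1 &= b; x2 &= b; x3 &= b; x4 &= b;
  x5 &= b; x6 &= b; x7 &= b; x8 &= b; x9 &= b;
  f[0] = f0 ^ x0; f[1] = f1 ^ x1; f[2] = f2 ^ x2; f[3] = f3 ^ x3; f[4] = f4 ^ x4;
  f[5] = f5 ^ x5; f[6] = f6 ^ x6; f[7] = f7 ^ x7; f[8] = f8 ^ x8; f[9] = f9 ^ x9;
}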
Replace (f,g) with (g,f) if b == 1;
replace (f,g) with (f,g) if b == 0.
Preconditions: b in {0,1}.
void fe_cswap(fe f, fe g, unsigned int b) {
  int32_t x0 = f0 ^ g0;
  int32_t x1 = f1 ^ g1;
  int32_t x2 = f2 ^ g2;
  int32_t x3 = f3 ^ g3;
  int32_t x4 = f4 ^ g4;
  int32_t x5 = f5 ^ g5;
  int32_t x6 = f6 ^ g6;
  int32_t x7 = f7 ^ g7;
  int32_t x8 = f8 ^ g8;
  int32_t x9 = f9 ^ g9;
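  /*
   * Not shown in this excerpt: fe_cswap continues exactly like fe_cmov
   * above, except that each masked difference is XORed into both f and g,
   * swapping the two field elements in constant time when b == 1.
   */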
void fe_copy(fe h, const fe f) {
Ignores top bit of h.
void fe_frombytes(fe h, const unsigned char *s) {
  int64_t h0 = load_4(s);
  int64_t h1 = load_3(s + 4) << 6;
  int64_t h2 = load_3(s + 7) << 5;
  int64_t h3 = load_3(s + 10) << 3;
  int64_t h4 = load_3(s + 13) << 2;
  int64_t h5 = load_4(s + 16);
  int64_t h6 = load_3(s + 20) << 7;
  int64_t h7 = load_3(s + 23) << 5;
  int64_t h8 = load_3(s + 26) << 4;
  int64_t h9 = (load_3(s + 29) & 8388607) << 2;
  carry9 = (h9 + (1L << 24)) >> 25;
  h9 -= shl64(carry9, 25);
  carry1 = (h1 + (1L << 24)) >> 25;
  h1 -= shl64(carry1, 25);
  carry3 = (h3 + (1L << 24)) >> 25;
  h3 -= shl64(carry3, 25);
  carry5 = (h5 + (1L << 24)) >> 25;
  h5 -= shl64(carry5, 25);
  carry7 = (h7 + (1L << 24)) >> 25;
  h7 -= shl64(carry7, 25);
  carry0 = (h0 + (1L << 25)) >> 26;
  h0 -= shl64(carry0, 26);
  carry2 = (h2 + (1L << 25)) >> 26;
  h2 -= shl64(carry2, 26);
  carry4 = (h4 + (1L << 25)) >> 26;
  h4 -= shl64(carry4, 26);
  carry6 = (h6 + (1L << 25)) >> 26;
  h6 -= shl64(carry6, 26);
  carry8 = (h8 + (1L << 25)) >> 26;
  h8 -= shl64(carry8, 26);
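  /*
   * Worked example of the carry pattern above (not in the original file):
   * carry = (h + (1 << 25)) >> 26 rounds h/2^26 to the nearest integer
   * (ties upward), so after h -= carry * 2^26 the limb lies in
   * [-2^25, 2^25).  For instance h = 2^26 + 3 gives carry = 1 and leaves
   * h = 3.  The 25-bit positions use (1 << 24) and a shift by 25 the same
   * way.
   */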
void fe_invert(fe out, const fe z) {
  for(i = 1; i < 1; ++i) {
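  /*
   * Note (not part of the original file): loops with the bound i < 1 never
   * execute; they appear to be kept so that every step of the addition
   * chain spells out its repeated-squaring count explicitly.
   */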
  for(i = 1; i < 2; ++i) {
  for(i = 1; i < 1; ++i) {
  for(i = 1; i < 5; ++i) {
  for(i = 1; i < 10; ++i) {
  for(i = 1; i < 20; ++i) {
  for(i = 1; i < 10; ++i) {
  for(i = 1; i < 50; ++i) {
  for(i = 1; i < 100; ++i) {
  for(i = 1; i < 50; ++i) {
  for(i = 1; i < 5; ++i) {
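/*
 * Illustrative self-check (not part of the original file; assumes the usual
 * prototypes for fe_mul and fe_tobytes are visible here): the chain above
 * computes z^(p-2) with p = 2^255-19, so by Fermat's little theorem
 * z * fe_invert(z) must encode to 1 for any nonzero z.
 */
static int fe_invert_selfcheck(const fe z) {
  fe zinv, prod;
  unsigned char s[32];
  int i, ok;
  fe_invert(zinv, z);
  fe_mul(prod, z, zinv);   /* should represent 1 modulo p */
  fe_tobytes(s, prod);     /* canonical 32-byte little-endian encoding */
  ok = (s[0] == 1);
  for (i = 1; i < 32; ++i) ok &= (s[i] == 0);
  return ok;               /* 1 on success */
}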
return 1 if f is in {1,3,5,...,q-2}
return 0 if f is in {0,2,4,...,q-1}
|f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
int fe_isnegative(const fe f) {
|f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
int fe_isnonzero(const fe f) {
#define F(i) r |= s[i]
Can overlap h with f or g.
|f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
|g| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
|h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
Notes on implementation strategy:
Using schoolbook multiplication.
Karatsuba would save a little in some cost models.
Most multiplications by 2 and 19 are 32-bit precomputations;
cheaper than 64-bit postcomputations.
There is one remaining multiplication by 19 in the carry chain;
one *19 precomputation can be merged into this,
but the resulting data flow is considerably less clean.
There are 12 carries below.
10 of them are 2-way parallelizable and vectorizable.
Can get away with 11 carries, but then data flow is much deeper.
With tighter constraints on inputs can squeeze carries into int32.
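/*
 * Worked instance of the 19 and 2 factors above (not in the original
 * comment): limb i has weight 2^ceil(25.5*i).  The product f1*g9 therefore
 * sits at weight 2^26 * 2^230 = 2^256; since 2^255 is congruent to 19
 * modulo p = 2^255-19, it folds back to position 0 with a factor
 * 2*19 = 38, which is the f1g9_38 term below.  When at most one of the two
 * indices is odd the half-bits do not both round up: f2*g8 sits at
 * 2^51 * 2^204 = 2^255 and folds with a plain factor 19 (f2g8_19).  The
 * same congruence is why the carry out of h9 is folded back into h0
 * scaled by 19.
 */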
void fe_mul(fe h, const fe f, const fe g) {
  int32_t g1_19 = 19 * g1; /* 1.959375*2^29 */
  int32_t g2_19 = 19 * g2; /* 1.959375*2^30; still ok */
  int32_t g3_19 = 19 * g3;
  int32_t g4_19 = 19 * g4;
  int32_t g5_19 = 19 * g5;
  int32_t g6_19 = 19 * g6;
  int32_t g7_19 = 19 * g7;
  int32_t g8_19 = 19 * g8;
  int32_t g9_19 = 19 * g9;
  int32_t f1_2 = 2 * f1;
  int32_t f3_2 = 2 * f3;
  int32_t f5_2 = 2 * f5;
  int32_t f7_2 = 2 * f7;
  int32_t f9_2 = 2 * f9;
  int64_t f0g0 = f0 * (int64_t) g0;
  int64_t f0g1 = f0 * (int64_t) g1;
  int64_t f0g2 = f0 * (int64_t) g2;
  int64_t f0g3 = f0 * (int64_t) g3;
  int64_t f0g4 = f0 * (int64_t) g4;
  int64_t f0g5 = f0 * (int64_t) g5;
  int64_t f0g6 = f0 * (int64_t) g6;
  int64_t f0g7 = f0 * (int64_t) g7;
  int64_t f0g8 = f0 * (int64_t) g8;
  int64_t f0g9 = f0 * (int64_t) g9;
  int64_t f1g0 = f1 * (int64_t) g0;
  int64_t f1g1_2 = f1_2 * (int64_t) g1;
  int64_t f1g2 = f1 * (int64_t) g2;
  int64_t f1g3_2 = f1_2 * (int64_t) g3;
  int64_t f1g4 = f1 * (int64_t) g4;
  int64_t f1g5_2 = f1_2 * (int64_t) g5;
  int64_t f1g6 = f1 * (int64_t) g6;
  int64_t f1g7_2 = f1_2 * (int64_t) g7;
  int64_t f1g8 = f1 * (int64_t) g8;
  int64_t f1g9_38 = f1_2 * (int64_t) g9_19;
  int64_t f2g0 = f2 * (int64_t) g0;
  int64_t f2g1 = f2 * (int64_t) g1;
  int64_t f2g2 = f2 * (int64_t) g2;
  int64_t f2g3 = f2 * (int64_t) g3;
  int64_t f2g4 = f2 * (int64_t) g4;
  int64_t f2g5 = f2 * (int64_t) g5;
  int64_t f2g6 = f2 * (int64_t) g6;
  int64_t f2g7 = f2 * (int64_t) g7;
  int64_t f2g8_19 = f2 * (int64_t) g8_19;
  int64_t f2g9_19 = f2 * (int64_t) g9_19;
  int64_t f3g0 = f3 * (int64_t) g0;
  int64_t f3g1_2 = f3_2 * (int64_t) g1;
  int64_t f3g2 = f3 * (int64_t) g2;
  int64_t f3g3_2 = f3_2 * (int64_t) g3;
  int64_t f3g4 = f3 * (int64_t) g4;
  int64_t f3g5_2 = f3_2 * (int64_t) g5;
  int64_t f3g6 = f3 * (int64_t) g6;
  int64_t f3g7_38 = f3_2 * (int64_t) g7_19;
  int64_t f3g8_19 = f3 * (int64_t) g8_19;
  int64_t f3g9_38 = f3_2 * (int64_t) g9_19;
  int64_t f4g0 = f4 * (int64_t) g0;
  int64_t f4g1 = f4 * (int64_t) g1;
  int64_t f4g2 = f4 * (int64_t) g2;
  int64_t f4g3 = f4 * (int64_t) g3;
  int64_t f4g4 = f4 * (int64_t) g4;
  int64_t f4g5 = f4 * (int64_t) g5;
  int64_t f4g6_19 = f4 * (int64_t) g6_19;
  int64_t f4g7_19 = f4 * (int64_t) g7_19;
  int64_t f4g8_19 = f4 * (int64_t) g8_19;
  int64_t f4g9_19 = f4 * (int64_t) g9_19;
  int64_t f5g0 = f5 * (int64_t) g0;
  int64_t f5g1_2 = f5_2 * (int64_t) g1;
  int64_t f5g2 = f5 * (int64_t) g2;
  int64_t f5g3_2 = f5_2 * (int64_t) g3;
  int64_t f5g4 = f5 * (int64_t) g4;
  int64_t f5g5_38 = f5_2 * (int64_t) g5_19;
  int64_t f5g6_19 = f5 * (int64_t) g6_19;
  int64_t f5g7_38 = f5_2 * (int64_t) g7_19;
  int64_t f5g8_19 = f5 * (int64_t) g8_19;
  int64_t f5g9_38 = f5_2 * (int64_t) g9_19;
  int64_t f6g0 = f6 * (int64_t) g0;
  int64_t f6g1 = f6 * (int64_t) g1;
  int64_t f6g2 = f6 * (int64_t) g2;
  int64_t f6g3 = f6 * (int64_t) g3;
  int64_t f6g4_19 = f6 * (int64_t) g4_19;
  int64_t f6g5_19 = f6 * (int64_t) g5_19;
  int64_t f6g6_19 = f6 * (int64_t) g6_19;
  int64_t f6g7_19 = f6 * (int64_t) g7_19;
  int64_t f6g8_19 = f6 * (int64_t) g8_19;
  int64_t f6g9_19 = f6 * (int64_t) g9_19;
  int64_t f7g0 = f7 * (int64_t) g0;
  int64_t f7g1_2 = f7_2 * (int64_t) g1;
  int64_t f7g2 = f7 * (int64_t) g2;
  int64_t f7g3_38 = f7_2 * (int64_t) g3_19;
  int64_t f7g4_19 = f7 * (int64_t) g4_19;
  int64_t f7g5_38 = f7_2 * (int64_t) g5_19;
  int64_t f7g6_19 = f7 * (int64_t) g6_19;
  int64_t f7g7_38 = f7_2 * (int64_t) g7_19;
  int64_t f7g8_19 = f7 * (int64_t) g8_19;
  int64_t f7g9_38 = f7_2 * (int64_t) g9_19;
  int64_t f8g0 = f8 * (int64_t) g0;
  int64_t f8g1 = f8 * (int64_t) g1;
  int64_t f8g2_19 = f8 * (int64_t) g2_19;
  int64_t f8g3_19 = f8 * (int64_t) g3_19;
  int64_t f8g4_19 = f8 * (int64_t) g4_19;
  int64_t f8g5_19 = f8 * (int64_t) g5_19;
  int64_t f8g6_19 = f8 * (int64_t) g6_19;
  int64_t f8g7_19 = f8 * (int64_t) g7_19;
  int64_t f8g8_19 = f8 * (int64_t) g8_19;
  int64_t f8g9_19 = f8 * (int64_t) g9_19;
  int64_t f9g0 = f9 * (int64_t) g0;
  int64_t f9g1_38 = f9_2 * (int64_t) g1_19;
  int64_t f9g2_19 = f9 * (int64_t) g2_19;
  int64_t f9g3_38 = f9_2 * (int64_t) g3_19;
  int64_t f9g4_19 = f9 * (int64_t) g4_19;
  int64_t f9g5_38 = f9_2 * (int64_t) g5_19;
  int64_t f9g6_19 = f9 * (int64_t) g6_19;
  int64_t f9g7_38 = f9_2 * (int64_t) g7_19;
  int64_t f9g8_19 = f9 * (int64_t) g8_19;
  int64_t f9g9_38 = f9_2 * (int64_t) g9_19;
  int64_t h0 = f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38;
  int64_t h1 = f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19;
  int64_t h2 = f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38;
  int64_t h3 = f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19;
  int64_t h4 = f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38;
  int64_t h5 = f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19;
  int64_t h6 = f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38;
  int64_t h7 = f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19;
  int64_t h8 = f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38;
  int64_t h9 = f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0;
  carry0 = (h0 + (1L << 25)) >> 26;
  h0 -= shl64(carry0, 26);
  carry4 = (h4 + (1L << 25)) >> 26;
  h4 -= shl64(carry4, 26);
  carry1 = (h1 + (1L << 24)) >> 25;
  h1 -= shl64(carry1, 25);
  carry5 = (h5 + (1L << 24)) >> 25;
  h5 -= shl64(carry5, 25);
  carry2 = (h2 + (1L << 25)) >> 26;
  h2 -= shl64(carry2, 26);
  carry6 = (h6 + (1L << 25)) >> 26;
  h6 -= shl64(carry6, 26);
  carry3 = (h3 + (1L << 24)) >> 25;
  h3 -= shl64(carry3, 25);
  carry7 = (h7 + (1L << 24)) >> 25;
  h7 -= shl64(carry7, 25);
  carry4 = (h4 + (1L << 25)) >> 26;
  h4 -= shl64(carry4, 26);
  carry8 = (h8 + (1L << 25)) >> 26;
  h8 -= shl64(carry8, 26);
  carry9 = (h9 + (1L << 24)) >> 25;
  h9 -= shl64(carry9, 25);
  carry0 = (h0 + (1L << 25)) >> 26;
  h0 -= shl64(carry0, 26);
Can overlap h with f.
|f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
|h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
void fe_mul121666(fe h, fe f) {
  int64_t h0 = f0 * (int64_t) 121666;
  int64_t h1 = f1 * (int64_t) 121666;
  int64_t h2 = f2 * (int64_t) 121666;
  int64_t h3 = f3 * (int64_t) 121666;
  int64_t h4 = f4 * (int64_t) 121666;
  int64_t h5 = f5 * (int64_t) 121666;
  int64_t h6 = f6 * (int64_t) 121666;
  int64_t h7 = f7 * (int64_t) 121666;
  int64_t h8 = f8 * (int64_t) 121666;
  int64_t h9 = f9 * (int64_t) 121666;
  carry9 = (h9 + (int64_t)(1 << 24)) >> 25;
  h9 -= shl64(carry9, 25);
  carry1 = (h1 + (int64_t)(1 << 24)) >> 25;
  h1 -= shl64(carry1, 25);
  carry3 = (h3 + (int64_t)(1 << 24)) >> 25;
  h3 -= shl64(carry3, 25);
  carry5 = (h5 + (int64_t)(1 << 24)) >> 25;
  h5 -= shl64(carry5, 25);
  carry7 = (h7 + (int64_t)(1 << 24)) >> 25;
  h7 -= shl64(carry7, 25);
  carry0 = (h0 + (int64_t)(1 << 25)) >> 26;
  h0 -= shl64(carry0, 26);
  carry2 = (h2 + (int64_t)(1 << 25)) >> 26;
  h2 -= shl64(carry2, 26);
  carry4 = (h4 + (int64_t)(1 << 25)) >> 26;
  h4 -= shl64(carry4, 26);
  carry6 = (h6 + (int64_t)(1 << 25)) >> 26;
  h6 -= shl64(carry6, 26);
  carry8 = (h8 + (int64_t)(1 << 25)) >> 26;
  h8 -= shl64(carry8, 26);
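  /*
   * Note (not part of the original file): 121666 = (486662 + 2)/4, the
   * constant used by the X25519 Montgomery ladder.  Since 121666 < 2^17 and
   * each |fi| is at most 1.1*2^26, every 64-bit product above stays below
   * 2^44, so this single carry pass is enough to restore the stated output
   * bounds.
   */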
|f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
|h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
void fe_neg(fe h, const fe f) {
void fe_pow22523(fe out, const fe z) {
  for(i = 1; i < 1; ++i) {
  for(i = 1; i < 2; ++i) {
  for(i = 1; i < 1; ++i) {
  for(i = 1; i < 5; ++i) {
  for(i = 1; i < 10; ++i) {
  for(i = 1; i < 20; ++i) {
  for(i = 1; i < 10; ++i) {
  for(i = 1; i < 50; ++i) {
  for(i = 1; i < 100; ++i) {
  for(i = 1; i < 50; ++i) {
  for(i = 1; i < 2; ++i) {
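  /*
   * Not in the original file: this chain mirrors fe_invert above; as the
   * name suggests, fe_pow22523 raises z to 2^252 - 3 = (p-5)/8, the
   * exponent used when taking square roots modulo p = 2^255-19.
   */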
Can overlap h with f.
|f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
|h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
See fe_mul.c for discussion of implementation strategy.
void fe_sq(fe h, const fe f) {
  int32_t f0_2 = 2 * f0;
  int32_t f1_2 = 2 * f1;
  int32_t f2_2 = 2 * f2;
  int32_t f3_2 = 2 * f3;
  int32_t f4_2 = 2 * f4;
  int32_t f5_2 = 2 * f5;
  int32_t f6_2 = 2 * f6;
  int32_t f7_2 = 2 * f7;
  int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
  int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
  int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
  int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
  int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
  int64_t f0f0 = f0 * (int64_t) f0;
  int64_t f0f1_2 = f0_2 * (int64_t) f1;
  int64_t f0f2_2 = f0_2 * (int64_t) f2;
  int64_t f0f3_2 = f0_2 * (int64_t) f3;
  int64_t f0f4_2 = f0_2 * (int64_t) f4;
  int64_t f0f5_2 = f0_2 * (int64_t) f5;
  int64_t f0f6_2 = f0_2 * (int64_t) f6;
  int64_t f0f7_2 = f0_2 * (int64_t) f7;
  int64_t f0f8_2 = f0_2 * (int64_t) f8;
  int64_t f0f9_2 = f0_2 * (int64_t) f9;
  int64_t f1f1_2 = f1_2 * (int64_t) f1;
  int64_t f1f2_2 = f1_2 * (int64_t) f2;
  int64_t f1f3_4 = f1_2 * (int64_t) f3_2;
  int64_t f1f4_2 = f1_2 * (int64_t) f4;
  int64_t f1f5_4 = f1_2 * (int64_t) f5_2;
  int64_t f1f6_2 = f1_2 * (int64_t) f6;
  int64_t f1f7_4 = f1_2 * (int64_t) f7_2;
  int64_t f1f8_2 = f1_2 * (int64_t) f8;
  int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
  int64_t f2f2 = f2 * (int64_t) f2;
  int64_t f2f3_2 = f2_2 * (int64_t) f3;
  int64_t f2f4_2 = f2_2 * (int64_t) f4;
  int64_t f2f5_2 = f2_2 * (int64_t) f5;
  int64_t f2f6_2 = f2_2 * (int64_t) f6;
  int64_t f2f7_2 = f2_2 * (int64_t) f7;
  int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
  int64_t f2f9_38 = f2 * (int64_t) f9_38;
  int64_t f3f3_2 = f3_2 * (int64_t) f3;
  int64_t f3f4_2 = f3_2 * (int64_t) f4;
  int64_t f3f5_4 = f3_2 * (int64_t) f5_2;
  int64_t f3f6_2 = f3_2 * (int64_t) f6;
  int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
  int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
  int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
  int64_t f4f4 = f4 * (int64_t) f4;
  int64_t f4f5_2 = f4_2 * (int64_t) f5;
  int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
  int64_t f4f7_38 = f4 * (int64_t) f7_38;
  int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
  int64_t f4f9_38 = f4 * (int64_t) f9_38;
  int64_t f5f5_38 = f5 * (int64_t) f5_38;
  int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
  int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
  int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
  int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
  int64_t f6f6_19 = f6 * (int64_t) f6_19;
  int64_t f6f7_38 = f6 * (int64_t) f7_38;
  int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
  int64_t f6f9_38 = f6 * (int64_t) f9_38;
  int64_t f7f7_38 = f7 * (int64_t) f7_38;
  int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
  int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
  int64_t f8f8_19 = f8 * (int64_t) f8_19;
  int64_t f8f9_38 = f8 * (int64_t) f9_38;
  int64_t f9f9_38 = f9 * (int64_t) f9_38;
  int64_t h0 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
  int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
  int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
  int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
  int64_t h4 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38;
  int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
  int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
  int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
  int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38;
  int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;
  carry0 = (h0 + (1L << 25)) >> 26;
  h0 -= shl64(carry0, 26);
  carry4 = (h4 + (1L << 25)) >> 26;
  h4 -= shl64(carry4, 26);
  carry1 = (h1 + (1L << 24)) >> 25;
  h1 -= shl64(carry1, 25);
  carry5 = (h5 + (1L << 24)) >> 25;
  h5 -= shl64(carry5, 25);
  carry2 = (h2 + (1L << 25)) >> 26;
  h2 -= shl64(carry2, 26);
  carry6 = (h6 + (1L << 25)) >> 26;
  h6 -= shl64(carry6, 26);
  carry3 = (h3 + (1L << 24)) >> 25;
  h3 -= shl64(carry3, 25);
  carry7 = (h7 + (1L << 24)) >> 25;
  h7 -= shl64(carry7, 25);
  carry4 = (h4 + (1L << 25)) >> 26;
  h4 -= shl64(carry4, 26);
  carry8 = (h8 + (1L << 25)) >> 26;
  h8 -= shl64(carry8, 26);
  carry9 = (h9 + (1L << 24)) >> 25;
  h9 -= shl64(carry9, 25);
  carry0 = (h0 + (1L << 25)) >> 26;
  h0 -= shl64(carry0, 26);
  h[0] = (int32_t) h0;
  h[1] = (int32_t) h1;
  h[2] = (int32_t) h2;
  h[3] = (int32_t) h3;
  h[4] = (int32_t) h4;
  h[5] = (int32_t) h5;
  h[6] = (int32_t) h6;
  h[7] = (int32_t) h7;
  h[8] = (int32_t) h8;
  h[9] = (int32_t) h9;
}
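/*
 * Note (not part of the original file): squaring only needs the upper
 * triangle of products.  Each cross term fi*fj with i < j is computed once,
 * with the factor 2 folded into the *_2/*_38/*_76 precomputations above,
 * e.g. f0f1_2 = 2*f0*f1 accounts for both f0*f1 and f1*f0.
 */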
Can overlap h with f.
|f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
|h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
See fe_mul.c for discussion of implementation strategy.
void fe_sq2(fe h, const fe f) {
  int32_t f0_2 = 2 * f0;
  int32_t f1_2 = 2 * f1;
  int32_t f2_2 = 2 * f2;
  int32_t f3_2 = 2 * f3;
  int32_t f4_2 = 2 * f4;
  int32_t f5_2 = 2 * f5;
  int32_t f6_2 = 2 * f6;
  int32_t f7_2 = 2 * f7;
  int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
  int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
  int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
  int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
  int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
  int64_t f0f0 = f0 * (int64_t) f0;
  int64_t f0f1_2 = f0_2 * (int64_t) f1;
  int64_t f0f2_2 = f0_2 * (int64_t) f2;
  int64_t f0f3_2 = f0_2 * (int64_t) f3;
  int64_t f0f4_2 = f0_2 * (int64_t) f4;
  int64_t f0f5_2 = f0_2 * (int64_t) f5;
  int64_t f0f6_2 = f0_2 * (int64_t) f6;
  int64_t f0f7_2 = f0_2 * (int64_t) f7;
  int64_t f0f8_2 = f0_2 * (int64_t) f8;
  int64_t f0f9_2 = f0_2 * (int64_t) f9;
  int64_t f1f1_2 = f1_2 * (int64_t) f1;
  int64_t f1f2_2 = f1_2 * (int64_t) f2;
  int64_t f1f3_4 = f1_2 * (int64_t) f3_2;
  int64_t f1f4_2 = f1_2 * (int64_t) f4;
  int64_t f1f5_4 = f1_2 * (int64_t) f5_2;
  int64_t f1f6_2 = f1_2 * (int64_t) f6;
  int64_t f1f7_4 = f1_2 * (int64_t) f7_2;
  int64_t f1f8_2 = f1_2 * (int64_t) f8;
  int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
  int64_t f2f2 = f2 * (int64_t) f2;
  int64_t f2f3_2 = f2_2 * (int64_t) f3;
  int64_t f2f4_2 = f2_2 * (int64_t) f4;
  int64_t f2f5_2 = f2_2 * (int64_t) f5;
  int64_t f2f6_2 = f2_2 * (int64_t) f6;
  int64_t f2f7_2 = f2_2 * (int64_t) f7;
  int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
  int64_t f2f9_38 = f2 * (int64_t) f9_38;
  int64_t f3f3_2 = f3_2 * (int64_t) f3;
  int64_t f3f4_2 = f3_2 * (int64_t) f4;
  int64_t f3f5_4 = f3_2 * (int64_t) f5_2;
  int64_t f3f6_2 = f3_2 * (int64_t) f6;
  int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
  int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
  int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
  int64_t f4f4 = f4 * (int64_t) f4;
  int64_t f4f5_2 = f4_2 * (int64_t) f5;
  int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
  int64_t f4f7_38 = f4 * (int64_t) f7_38;
  int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
  int64_t f4f9_38 = f4 * (int64_t) f9_38;
  int64_t f5f5_38 = f5 * (int64_t) f5_38;
  int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
  int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
  int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
  int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
  int64_t f6f6_19 = f6 * (int64_t) f6_19;
  int64_t f6f7_38 = f6 * (int64_t) f7_38;
  int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
  int64_t f6f9_38 = f6 * (int64_t) f9_38;
  int64_t f7f7_38 = f7 * (int64_t) f7_38;
  int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
  int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
  int64_t f8f8_19 = f8 * (int64_t) f8_19;
  int64_t f8f9_38 = f8 * (int64_t) f9_38;
  int64_t f9f9_38 = f9 * (int64_t) f9_38;
  int64_t h0 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
  int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
  int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
  int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
  int64_t h4 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38;
  int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
  int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
  int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
  int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38;
  int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;
  carry0 = (h0 + (1L << 25)) >> 26;
  h0 -= shl64(carry0, 26);
  carry4 = (h4 + (1L << 25)) >> 26;
  h4 -= shl64(carry4, 26);
  carry1 = (h1 + (1L << 24)) >> 25;
  h1 -= shl64(carry1, 25);
  carry5 = (h5 + (1L << 24)) >> 25;
  h5 -= shl64(carry5, 25);
  carry2 = (h2 + (1L << 25)) >> 26;
  h2 -= shl64(carry2, 26);
  carry6 = (h6 + (1L << 25)) >> 26;
  h6 -= shl64(carry6, 26);
  carry3 = (h3 + (1L << 24)) >> 25;
  h3 -= shl64(carry3, 25);
  carry7 = (h7 + (1L << 24)) >> 25;
  h7 -= shl64(carry7, 25);
  carry4 = (h4 + (1L << 25)) >> 26;
  h4 -= shl64(carry4, 26);
  carry8 = (h8 + (1L << 25)) >> 26;
  h8 -= shl64(carry8, 26);
  carry9 = (h9 + (1L << 24)) >> 25;
  h9 -= shl64(carry9, 25);
  carry0 = (h0 + (1L << 25)) >> 26;
  h0 -= shl64(carry0, 26);
  h[0] = (int32_t) h0;
  h[1] = (int32_t) h1;
  h[2] = (int32_t) h2;
  h[3] = (int32_t) h3;
  h[4] = (int32_t) h4;
  h[5] = (int32_t) h5;
  h[6] = (int32_t) h6;
  h[7] = (int32_t) h7;
  h[8] = (int32_t) h8;
  h[9] = (int32_t) h9;
}
Can overlap h with f or g.
|f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
|g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
|h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
void fe_sub(fe h, const fe f, const fe g) {
  int32_t h0 = f0 - g0;
  int32_t h1 = f1 - g1;
  int32_t h2 = f2 - g2;
  int32_t h3 = f3 - g3;
  int32_t h4 = f4 - g4;
  int32_t h5 = f5 - g5;
  int32_t h6 = f6 - g6;
  int32_t h7 = f7 - g7;
  int32_t h8 = f8 - g8;
  int32_t h9 = f9 - g9;
|h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
Write p=2^255-19; q=floor(h/p).
Basic claim: q = floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))).
Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
Have 0<=r<=p-1=2^255-20.
Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
Write x=r+19(2^-255)r+y.
Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
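/*
 * Illustration (not in the original comment): for h = p the definition
 * gives q = floor(h/p) = 1, so the output computed below is
 * h - (2^255-19) = 0, the canonical encoding of zero; for 0 <= h <= p-1 it
 * gives q = 0 and h itself is written out.
 */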
void fe_tobytes(unsigned char *s, const fe h) {
  q = (19 * h9 + (((int32_t) 1) << 24)) >> 25;
  /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
  /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
  h0 -= shl32(carry0, 26);
  h1 -= shl32(carry1, 25);
  h2 -= shl32(carry2, 26);
  h3 -= shl32(carry3, 25);
  h4 -= shl32(carry4, 26);
  h5 -= shl32(carry5, 25);
  h6 -= shl32(carry6, 26);
  h7 -= shl32(carry7, 25);
  h8 -= shl32(carry8, 26);
  h9 -= shl32(carry9, 25);
  Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
  Have h0+...+2^230 h9 between 0 and 2^255-1;
  evidently 2^255 h10-2^255 q = 0.
  Goal: Output h0+...+2^230 h9.
  s[0] = (unsigned char)(h0 >> 0);
  s[1] = (unsigned char)(h0 >> 8);
  s[2] = (unsigned char)(h0 >> 16);
  s[3] = (unsigned char)((h0 >> 24) | shl32(h1, 2));
  s[4] = (unsigned char)(h1 >> 6);
  s[5] = (unsigned char)(h1 >> 14);
  s[6] = (unsigned char)((h1 >> 22) | shl32(h2, 3));
  s[7] = (unsigned char)(h2 >> 5);
  s[8] = (unsigned char)(h2 >> 13);
  s[9] = (unsigned char)((h2 >> 21) | shl32(h3, 5));
  s[10] = (unsigned char)(h3 >> 3);
  s[11] = (unsigned char)(h3 >> 11);
  s[12] = (unsigned char)((h3 >> 19) | shl32(h4, 6));
  s[13] = (unsigned char)(h4 >> 2);
  s[14] = (unsigned char)(h4 >> 10);
  s[15] = (unsigned char)(h4 >> 18);
  s[16] = (unsigned char)(h5 >> 0);
  s[17] = (unsigned char)(h5 >> 8);
  s[18] = (unsigned char)(h5 >> 16);
  s[19] = (unsigned char)((h5 >> 24) | shl32(h6, 1));
  s[20] = (unsigned char)(h6 >> 7);
  s[21] = (unsigned char)(h6 >> 15);
  s[22] = (unsigned char)((h6 >> 23) | shl32(h7, 3));
  s[23] = (unsigned char)(h7 >> 5);
  s[24] = (unsigned char)(h7 >> 13);
  s[25] = (unsigned char)((h7 >> 21) | shl32(h8, 4));
  s[26] = (unsigned char)(h8 >> 4);
  s[27] = (unsigned char)(h8 >> 12);
  s[28] = (unsigned char)((h8 >> 20) | shl32(h9, 6));
  s[29] = (unsigned char)(h9 >> 2);
  s[30] = (unsigned char)(h9 >> 10);
  s[31] = (unsigned char)(h9 >> 18);
}
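/*
 * Illustrative round trip (not part of the original file; assumes the fe
 * type and the prototypes above are in scope): reduce an arbitrary 32-byte
 * string to the canonical little-endian encoding of its value mod 2^255-19.
 */
static void fe_canonicalize_example(unsigned char out[32], const unsigned char in[32]) {
  fe t;
  fe_frombytes(t, in);   /* unpack into ten 25/26-bit limbs; top bit of in[31] ignored */
  fe_tobytes(out, t);    /* fully reduce and repack into 32 bytes */
}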