- /* init state */
- h0 = 0;
- h1 = 0;
- h2 = 0;
- h3 = 0;
- h4 = 0;
-
- /* full blocks */
- if (inlen < 16)
- goto poly1305_donna_atmost15bytes;
-
- poly1305_donna_16bytes:
- m += 16;
- inlen -= 16;
-
- t0 = U8TO32_LE(m - 16);
- t1 = U8TO32_LE(m - 12);
- t2 = U8TO32_LE(m - 8);
- t3 = U8TO32_LE(m - 4);
-
- h0 += t0 & 0x3ffffff;
- h1 += ((((uint64_t) t1 << 32) | t0) >> 26) & 0x3ffffff;
- h2 += ((((uint64_t) t2 << 32) | t1) >> 20) & 0x3ffffff;
- h3 += ((((uint64_t) t3 << 32) | t2) >> 14) & 0x3ffffff;
- h4 += (t3 >> 8) | (1 << 24);
-
- poly1305_donna_mul:
- t[0] = mul32x32_64(h0, r0) + mul32x32_64(h1, s4) + mul32x32_64(h2, s3) + mul32x32_64(h3, s2) + mul32x32_64(h4, s1);
- t[1] = mul32x32_64(h0, r1) + mul32x32_64(h1, r0) + mul32x32_64(h2, s4) + mul32x32_64(h3, s3) + mul32x32_64(h4, s2);
- t[2] = mul32x32_64(h0, r2) + mul32x32_64(h1, r1) + mul32x32_64(h2, r0) + mul32x32_64(h3, s4) + mul32x32_64(h4, s3);
- t[3] = mul32x32_64(h0, r3) + mul32x32_64(h1, r2) + mul32x32_64(h2, r1) + mul32x32_64(h3, r0) + mul32x32_64(h4, s4);
- t[4] = mul32x32_64(h0, r4) + mul32x32_64(h1, r3) + mul32x32_64(h2, r2) + mul32x32_64(h3, r1) + mul32x32_64(h4, r0);
-
- h0 = (uint32_t) t[0] & 0x3ffffff;
- c = (t[0] >> 26);
- t[1] += c;
- h1 = (uint32_t) t[1] & 0x3ffffff;
- b = (uint32_t) (t[1] >> 26);
- t[2] += b;
- h2 = (uint32_t) t[2] & 0x3ffffff;
- b = (uint32_t) (t[2] >> 26);
- t[3] += b;
- h3 = (uint32_t) t[3] & 0x3ffffff;
- b = (uint32_t) (t[3] >> 26);
- t[4] += b;
- h4 = (uint32_t) t[4] & 0x3ffffff;
- b = (uint32_t) (t[4] >> 26);
- h0 += b * 5;
-
- if (inlen >= 16)
- goto poly1305_donna_16bytes;
-
- /* final bytes */
- poly1305_donna_atmost15bytes:
- if (!inlen)
- goto poly1305_donna_finish;
-
- for (j = 0; j < inlen; j++)
- mp[j] = m[j];
- mp[j++] = 1;
- for (; j < 16; j++)
- mp[j] = 0;
- inlen = 0;
-
- t0 = U8TO32_LE(mp + 0);
- t1 = U8TO32_LE(mp + 4);
- t2 = U8TO32_LE(mp + 8);
- t3 = U8TO32_LE(mp + 12);
-
- h0 += t0 & 0x3ffffff;
- h1 += ((((uint64_t) t1 << 32) | t0) >> 26) & 0x3ffffff;
- h2 += ((((uint64_t) t2 << 32) | t1) >> 20) & 0x3ffffff;
- h3 += ((((uint64_t) t3 << 32) | t2) >> 14) & 0x3ffffff;
- h4 += (t3 >> 8);
-
- goto poly1305_donna_mul;
-
- poly1305_donna_finish:
- b = h0 >> 26;
- h0 = h0 & 0x3ffffff;
- h1 += b;
- b = h1 >> 26;
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ while(bytes >= POLY1305_BLOCK_SIZE) {
+ /* h += m[i] */
+ h0 += (U8TO32(m + 0)) & 0x3ffffff;
+ h1 += (U8TO32(m + 3) >> 2) & 0x3ffffff;
+ h2 += (U8TO32(m + 6) >> 4) & 0x3ffffff;
+ h3 += (U8TO32(m + 9) >> 6) & 0x3ffffff;
+ h4 += (U8TO32(m + 12) >> 8) | hibit;
+
+ /* h *= r */
+ d0 = ((uint64_t)h0 * r0) + ((uint64_t)h1 * s4) + ((uint64_t)h2 * s3) + ((uint64_t)h3 * s2) + ((uint64_t)h4 * s1);
+ d1 = ((uint64_t)h0 * r1) + ((uint64_t)h1 * r0) + ((uint64_t)h2 * s4) + ((uint64_t)h3 * s3) + ((uint64_t)h4 * s2);
+ d2 = ((uint64_t)h0 * r2) + ((uint64_t)h1 * r1) + ((uint64_t)h2 * r0) + ((uint64_t)h3 * s4) + ((uint64_t)h4 * s3);
+ d3 = ((uint64_t)h0 * r3) + ((uint64_t)h1 * r2) + ((uint64_t)h2 * r1) + ((uint64_t)h3 * r0) + ((uint64_t)h4 * s4);
+ d4 = ((uint64_t)h0 * r4) + ((uint64_t)h1 * r3) + ((uint64_t)h2 * r2) + ((uint64_t)h3 * r1) + ((uint64_t)h4 * r0);
+
+ /* (partial) h %= p */
+ c = (uint32_t)(d0 >> 26);
+ h0 = (uint32_t)d0 & 0x3ffffff;
+ d1 += c;
+ c = (uint32_t)(d1 >> 26);
+ h1 = (uint32_t)d1 & 0x3ffffff;
+ d2 += c;
+ c = (uint32_t)(d2 >> 26);
+ h2 = (uint32_t)d2 & 0x3ffffff;
+ d3 += c;
+ c = (uint32_t)(d3 >> 26);
+ h3 = (uint32_t)d3 & 0x3ffffff;
+ d4 += c;
+ c = (uint32_t)(d4 >> 26);
+ h4 = (uint32_t)d4 & 0x3ffffff;
+ h0 += c * 5;
+ c = (h0 >> 26);
+ h0 = h0 & 0x3ffffff;
+ h1 += c;
+
+ m += POLY1305_BLOCK_SIZE;
+ bytes -= POLY1305_BLOCK_SIZE;
+ }
+
+ st->h[0] = h0;
+ st->h[1] = h1;
+ st->h[2] = h2;
+ st->h[3] = h3;
+ st->h[4] = h4;
+}
+
+void
+poly1305_finish(struct poly1305_context *st, unsigned char mac[16]) {
+ uint32_t h0, h1, h2, h3, h4, c;
+ uint32_t g0, g1, g2, g3, g4;
+ uint64_t f;
+ uint32_t mask;
+
+ /* process the remaining block */
+ if(st->leftover) {
+ size_t i = st->leftover;
+ st->buffer[i++] = 1;
+
+ for(; i < POLY1305_BLOCK_SIZE; i++) {
+ st->buffer[i] = 0;
+ }
+
+ st->final = 1;
+ poly1305_blocks(st, st->buffer, POLY1305_BLOCK_SIZE);
+ }
+
+ /* fully carry h */
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ c = h1 >> 26;