Blame - libmincrypt/p256_ec.c - android_system_core

blob: 90262cc907b611c171f3cddff98e852ad0dc39e2 [file] [log] [blame]

Kenny Root	db0850c	2013-10-08 12:52:07 -0700	[diff] [blame]	1	/*
				2	* Copyright 2013 The Android Open Source Project
				3	*
				4	* Redistribution and use in source and binary forms, with or without
				5	* modification, are permitted provided that the following conditions are met:
				6	* * Redistributions of source code must retain the above copyright
				7	* notice, this list of conditions and the following disclaimer.
				8	* * Redistributions in binary form must reproduce the above copyright
				9	* notice, this list of conditions and the following disclaimer in the
				10	* documentation and/or other materials provided with the distribution.
				11	* * Neither the name of Google Inc. nor the names of its contributors may
				12	* be used to endorse or promote products derived from this software
				13	* without specific prior written permission.
				14	*
				15	* THIS SOFTWARE IS PROVIDED BY Google Inc. ``AS IS'' AND ANY EXPRESS OR
				16	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
				17	* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
				18	* EVENT SHALL Google Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				19	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
				20	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
				21	* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
				22	* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
				23	* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
				24	* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				25	*/
				26
				27	// This is an implementation of the P256 elliptic curve group. It's written to
				28	// be portable 32-bit, although it's still constant-time.
				29	//
				30	// WARNING: Implementing these functions in a constant-time manner is far from
				31	// obvious. Be careful when touching this code.
				32	//
				33	// See http://www.imperialviolet.org/2010/12/04/ecc.html ([1]) for background.
				34
				35	#include <stdint.h>
				36	#include <stdio.h>
				37
				38	#include <string.h>
				39	#include <stdlib.h>
				40
				41	#include "mincrypt/p256.h"
				42
				43	typedef uint8_t u8;
				44	typedef uint32_t u32;
				45	typedef int32_t s32;
				46	typedef uint64_t u64;
				47
				48	/* Our field elements are represented as nine 32-bit limbs.
				49	*
				50	* The value of an felem (field element) is:
				51	* x[0] + (x[1] * 2*29) + (x[2] 2*57) + ... + (x[8] 2**228)
				52	*
				53	* That is, each limb is alternately 29 or 28-bits wide in little-endian
				54	* order.
				55	*
				56	* This means that an felem hits 2257, rather than 2256 as we would like. A
				57	* 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
				58	* when multiplying as terms end up one bit short of a limb which would require
				59	* much bit-shifting to correct.
				60	*
				61	* Finally, the values stored in an felem are in Montgomery form. So the value
				62	* \|y\| is stored as (yR) mod p, where p is the P-256 prime and R is 2*257.
				63	*/
				64	typedef u32 limb;
				65	#define NLIMBS 9
				66	typedef limb felem[NLIMBS];
				67
				68	static const limb kBottom28Bits = 0xfffffff;
				69	static const limb kBottom29Bits = 0x1fffffff;
				70
				71	/* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
				72	* 28-bit words. */
				73	static const felem kOne = {
				74	2, 0, 0, 0xffff800,
				75	0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
				76	0
				77	};
				78	static const felem kZero = {0};
				79	static const felem kP = {
				80	0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
				81	0, 0, 0x200000, 0xf000000,
				82	0xfffffff
				83	};
				84	static const felem k2P = {
				85	0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
				86	0, 0, 0x400000, 0xe000000,
				87	0x1fffffff
				88	};
				89	/* kPrecomputed contains precomputed values to aid the calculation of scalar
				90	* multiples of the base point, G. It's actually two, equal length, tables
				91	* concatenated.
				92	*
				93	* The first table contains (x,y) felem pairs for 16 multiples of the base
				94	* point, G.
				95	*
				96	* Index \| Index (binary) \| Value
				97	* 0 \| 0000 \| 0G (all zeros, omitted)
				98	* 1 \| 0001 \| G
				99	* 2 \| 0010 \| 2**64G
				100	* 3 \| 0011 \| 2**64G + G
				101	* 4 \| 0100 \| 2**128G
				102	* 5 \| 0101 \| 2**128G + G
				103	* 6 \| 0110 \| 2128G + 264G
				104	* 7 \| 0111 \| 2128G + 264G + G
				105	* 8 \| 1000 \| 2**192G
				106	* 9 \| 1001 \| 2**192G + G
				107	* 10 \| 1010 \| 2192G + 264G
				108	* 11 \| 1011 \| 2192G + 264G + G
				109	* 12 \| 1100 \| 2192G + 2128G
				110	* 13 \| 1101 \| 2192G + 2128G + G
				111	* 14 \| 1110 \| 2192G + 2128G + 2**64G
				112	* 15 \| 1111 \| 2192G + 2128G + 2**64G + G
				113	*
				114	* The second table follows the same style, but the terms are 2**32G,
				115	* 296G, 2160G, 2**224G.
				116	*
				117	* This is ~2KB of data. */
				118	static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
				119	0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
				120	0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
				121	0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
				122	0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
				123	0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
				124	0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
				125	0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
				126	0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
				127	0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
				128	0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
				129	0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
				130	0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
				131	0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
				132	0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
				133	0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
				134	0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
				135	0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
				136	0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
				137	0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
				138	0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
				139	0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
				140	0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
				141	0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
				142	0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
				143	0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
				144	0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
				145	0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
				146	0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
				147	0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
				148	0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
				149	0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
				150	0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
				151	0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
				152	0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
				153	0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
				154	0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
				155	0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
				156	0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
				157	0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
				158	0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
				159	0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
				160	0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
				161	0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
				162	0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
				163	0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
				164	0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
				165	0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
				166	0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
				167	0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
				168	0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
				169	0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
				170	0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
				171	0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
				172	0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
				173	0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
				174	0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
				175	0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
				176	0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
				177	0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
				178	0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
				179	};
				180
				181
				182	/* Field element operations: */
				183
				184	/* NON_ZERO_TO_ALL_ONES returns:
				185	* 0xffffffff for 0 < x <= 2**31
				186	* 0 for x == 0 or x > 2**31.
				187	*
				188	* x must be a u32 or an equivalent type such as limb. */
				189	#define NON_ZERO_TO_ALL_ONES(x) ((((u32)(x) - 1) >> 31) - 1)
				190
				191	/* felem_reduce_carry adds a multiple of p in order to cancel \|carry\|,
				192	* which is a term at 2**257.
				193	*
				194	* On entry: carry < 23, inout[0,2,...] < 229, inout[1,3,...] < 2**28.
				195	* On exit: inout[0,2,..] < 230, inout[1,3,...] < 229. */
				196	static void felem_reduce_carry(felem inout, limb carry) {
				197	const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);
				198
				199	inout[0] += carry << 1;
				200	inout[3] += 0x10000000 & carry_mask;
				201	/* carry < 23 thus (carry << 11) < 214 and we added 2**28 in the
				202	* previous line therefore this doesn't underflow. */
				203	inout[3] -= carry << 11;
				204	inout[4] += (0x20000000 - 1) & carry_mask;
				205	inout[5] += (0x10000000 - 1) & carry_mask;
				206	inout[6] += (0x20000000 - 1) & carry_mask;
				207	inout[6] -= carry << 22;
				208	/* This may underflow if carry is non-zero but, if so, we'll fix it in the
				209	* next line. */
				210	inout[7] -= 1 & carry_mask;
				211	inout[7] += carry << 25;
				212	}
				213
				214	/* felem_sum sets out = in+in2.
				215	*
				216	* On entry, in[i]+in2[i] must not overflow a 32-bit word.
				217	* On exit: out[0,2,...] < 230, out[1,3,...] < 229 */
				218	static void felem_sum(felem out, const felem in, const felem in2) {
				219	limb carry = 0;
				220	unsigned i;
				221
				222	for (i = 0;; i++) {
				223	out[i] = in[i] + in2[i];
				224	out[i] += carry;
				225	carry = out[i] >> 29;
				226	out[i] &= kBottom29Bits;
				227
				228	i++;
				229	if (i == NLIMBS)
				230	break;
				231
				232	out[i] = in[i] + in2[i];
				233	out[i] += carry;
				234	carry = out[i] >> 28;
				235	out[i] &= kBottom28Bits;
				236	}
				237
				238	felem_reduce_carry(out, carry);
				239	}
				240
				241	#define two31m3 (((limb)1) << 31) - (((limb)1) << 3)
				242	#define two30m2 (((limb)1) << 30) - (((limb)1) << 2)
				243	#define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)
				244	#define two31m2 (((limb)1) << 31) - (((limb)1) << 2)
				245	#define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)
				246	#define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)
				247
				248	/* zero31 is 0 mod p. */
				249	static const felem zero31 = { two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2 };
				250
				251	/* felem_diff sets out = in-in2.
				252	*
				253	* On entry: in[0,2,...] < 230, in[1,3,...] < 229 and
				254	* in2[0,2,...] < 230, in2[1,3,...] < 229.
				255	* On exit: out[0,2,...] < 230, out[1,3,...] < 229. */
				256	static void felem_diff(felem out, const felem in, const felem in2) {
				257	limb carry = 0;
				258	unsigned i;
				259
				260	for (i = 0;; i++) {
				261	out[i] = in[i] - in2[i];
				262	out[i] += zero31[i];
				263	out[i] += carry;
				264	carry = out[i] >> 29;
				265	out[i] &= kBottom29Bits;
				266
				267	i++;
				268	if (i == NLIMBS)
				269	break;
				270
				271	out[i] = in[i] - in2[i];
				272	out[i] += zero31[i];
				273	out[i] += carry;
				274	carry = out[i] >> 28;
				275	out[i] &= kBottom28Bits;
				276	}
				277
				278	felem_reduce_carry(out, carry);
				279	}
				280
				281	/* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
				282	* with the same 29,28,... bit positions as an felem.
				283	*
				284	* The values in felems are in Montgomery form: xR mod p where R = 2*257.
				285	* Since we just multiplied two Montgomery values together, the result is
				286	* xyR*R mod p. We wish to divide by R in order for the result also to be
				287	* in Montgomery form.
				288	*
				289	* On entry: tmp[i] < 2**64
				290	* On exit: out[0,2,...] < 230, out[1,3,...] < 229 */
				291	static void felem_reduce_degree(felem out, u64 tmp[17]) {
				292	/* The following table may be helpful when reading this code:
				293	*
				294	* Limb number: 0 \| 1 \| 2 \| 3 \| 4 \| 5 \| 6 \| 7 \| 8 \| 9 \| 10...
				295	* Width (bits): 29\| 28\| 29\| 28\| 29\| 28\| 29\| 28\| 29\| 28\| 29
				296	* Start bit: 0 \| 29\| 57\| 86\|114\|143\|171\|200\|228\|257\|285
				297	* (odd phase): 0 \| 28\| 57\| 85\|114\|142\|171\|199\|228\|256\|285 */
				298	limb tmp2[18], carry, x, xMask;
				299	unsigned i;
				300
				301	/* tmp contains 64-bit words with the same 29,28,29-bit positions as an
				302	* felem. So the top of an element of tmp might overlap with another
				303	* element two positions down. The following loop eliminates this
				304	* overlap. */
				305	tmp2[0] = (limb)(tmp[0] & kBottom29Bits);
				306
				307	/* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try
				308	* and hint to the compiler that it can do a single-word shift by selecting
				309	* the right register rather than doing a double-word shift and truncating
				310	* afterwards. */
				311	tmp2[1] = ((limb) tmp[0]) >> 29;
				312	tmp2[1] \|= (((limb)(tmp[0] >> 32)) << 3) & kBottom28Bits;
				313	tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;
				314	carry = tmp2[1] >> 28;
				315	tmp2[1] &= kBottom28Bits;
				316
				317	for (i = 2; i < 17; i++) {
				318	tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25;
				319	tmp2[i] += ((limb)(tmp[i - 1])) >> 28;
				320	tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 4) & kBottom29Bits;
				321	tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;
				322	tmp2[i] += carry;
				323	carry = tmp2[i] >> 29;
				324	tmp2[i] &= kBottom29Bits;
				325
				326	i++;
				327	if (i == 17)
				328	break;
				329	tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25;
				330	tmp2[i] += ((limb)(tmp[i - 1])) >> 29;
				331	tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 3) & kBottom28Bits;
				332	tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;
				333	tmp2[i] += carry;
				334	carry = tmp2[i] >> 28;
				335	tmp2[i] &= kBottom28Bits;
				336	}
				337
				338	tmp2[17] = ((limb)(tmp[15] >> 32)) >> 25;
				339	tmp2[17] += ((limb)(tmp[16])) >> 29;
				340	tmp2[17] += (((limb)(tmp[16] >> 32)) << 3);
				341	tmp2[17] += carry;
				342
				343	/* Montgomery elimination of terms.
				344	*
				345	* Since R is 2**257, we can divide by R with a bitwise shift if we can
				346	* ensure that the right-most 257 bits are all zero. We can make that true by
				347	* adding multiplies of p without affecting the value.
				348	*
				349	* So we eliminate limbs from right to left. Since the bottom 29 bits of p
				350	* are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
				351	* We can do that for 8 further limbs and then right shift to eliminate the
				352	* extra factor of R. */
				353	for (i = 0;; i += 2) {
				354	tmp2[i + 1] += tmp2[i] >> 29;
				355	x = tmp2[i] & kBottom29Bits;
				356	xMask = NON_ZERO_TO_ALL_ONES(x);
				357	tmp2[i] = 0;
				358
				359	/* The bounds calculations for this loop are tricky. Each iteration of
				360	* the loop eliminates two words by adding values to words to their
				361	* right.
				362	*
				363	* The following table contains the amounts added to each word (as an
				364	* offset from the value of i at the top of the loop). The amounts are
				365	* accounted for from the first and second half of the loop separately
				366	* and are written as, for example, 28 to mean a value <2**28.
				367	*
				368	* Word: 3 4 5 6 7 8 9 10
				369	* Added in top half: 28 11 29 21 29 28
				370	* 28 29
				371	* 29
				372	* Added in bottom half: 29 10 28 21 28 28
				373	* 29
				374	*
				375	* The value that is currently offset 7 will be offset 5 for the next
				376	* iteration and then offset 3 for the iteration after that. Therefore
				377	* the total value added will be the values added at 7, 5 and 3.
				378	*
				379	* The following table accumulates these values. The sums at the bottom
				380	* are written as, for example, 29+28, to mean a value < 229+228.
				381	*
				382	* Word: 3 4 5 6 7 8 9 10 11 12 13
				383	* 28 11 10 29 21 29 28 28 28 28 28
				384	* 29 28 11 28 29 28 29 28 29 28
				385	* 29 28 21 21 29 21 29 21
				386	* 10 29 28 21 28 21 28
				387	* 28 29 28 29 28 29 28
				388	* 11 10 29 10 29 10
				389	* 29 28 11 28 11
				390	* 29 29
				391	* --------------------------------------------
				392	* 30+ 31+ 30+ 31+ 30+
				393	* 28+ 29+ 28+ 29+ 21+
				394	* 21+ 28+ 21+ 28+ 10
				395	* 10 21+ 10 21+
				396	* 11 11
				397	*
				398	* So the greatest amount is added to tmp2[10] and tmp2[12]. If
				399	* tmp2[10/12] has an initial value of <2**29, then the maximum value
				400	* will be < 231 + 230 + 228 + 221 + 211, which is < 232,
				401	* as required. */
				402	tmp2[i + 3] += (x << 10) & kBottom28Bits;
				403	tmp2[i + 4] += (x >> 18);
				404
				405	tmp2[i + 6] += (x << 21) & kBottom29Bits;
				406	tmp2[i + 7] += x >> 8;
				407
				408	/* At position 200, which is the starting bit position for word 7, we
				409	* have a factor of 0xf000000 = 228 - 224. */
				410	tmp2[i + 7] += 0x10000000 & xMask;
				411	/* Word 7 is 28 bits wide, so the 2*28 term exactly hits word 8. /
				412	tmp2[i + 8] += (x - 1) & xMask;
				413	tmp2[i + 7] -= (x << 24) & kBottom28Bits;
				414	tmp2[i + 8] -= x >> 4;
				415
				416	tmp2[i + 8] += 0x20000000 & xMask;
				417	tmp2[i + 8] -= x;
				418	tmp2[i + 8] += (x << 28) & kBottom29Bits;
				419	tmp2[i + 9] += ((x >> 1) - 1) & xMask;
				420
				421	if (i+1 == NLIMBS)
				422	break;
				423	tmp2[i + 2] += tmp2[i + 1] >> 28;
				424	x = tmp2[i + 1] & kBottom28Bits;
				425	xMask = NON_ZERO_TO_ALL_ONES(x);
				426	tmp2[i + 1] = 0;
				427
				428	tmp2[i + 4] += (x << 11) & kBottom29Bits;
				429	tmp2[i + 5] += (x >> 18);
				430
				431	tmp2[i + 7] += (x << 21) & kBottom28Bits;
				432	tmp2[i + 8] += x >> 7;
				433
				434	/* At position 199, which is the starting bit of the 8th word when
				435	* dealing with a context starting on an odd word, we have a factor of
				436	* 0x1e000000 = 229 - 225. Since we have not updated i, the 8th
				437	* word from i+1 is i+8. */
				438	tmp2[i + 8] += 0x20000000 & xMask;
				439	tmp2[i + 9] += (x - 1) & xMask;
				440	tmp2[i + 8] -= (x << 25) & kBottom29Bits;
				441	tmp2[i + 9] -= x >> 4;
				442
				443	tmp2[i + 9] += 0x10000000 & xMask;
				444	tmp2[i + 9] -= x;
				445	tmp2[i + 10] += (x - 1) & xMask;
				446	}
				447
				448	/* We merge the right shift with a carry chain. The words above 2**257 have
				449	* widths of 28,29,... which we need to correct when copying them down. */
				450	carry = 0;
				451	for (i = 0; i < 8; i++) {
				452	/* The maximum value of tmp2[i + 9] occurs on the first iteration and
				453	* is < 230+229+228. Adding 229 (from tmp2[i + 10]) is
				454	* therefore safe. */
				455	out[i] = tmp2[i + 9];
				456	out[i] += carry;
				457	out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;
				458	carry = out[i] >> 29;
				459	out[i] &= kBottom29Bits;
				460
				461	i++;
				462	out[i] = tmp2[i + 9] >> 1;
				463	out[i] += carry;
				464	carry = out[i] >> 28;
				465	out[i] &= kBottom28Bits;
				466	}
				467
				468	out[8] = tmp2[17];
				469	out[8] += carry;
				470	carry = out[8] >> 29;
				471	out[8] &= kBottom29Bits;
				472
				473	felem_reduce_carry(out, carry);
				474	}
				475
				476	/* felem_square sets out=in*in.
				477	*
				478	* On entry: in[0,2,...] < 230, in[1,3,...] < 229.
				479	* On exit: out[0,2,...] < 230, out[1,3,...] < 229. */
				480	static void felem_square(felem out, const felem in) {
				481	u64 tmp[17];
				482
				483	tmp[0] = ((u64) in[0]) * in[0];
				484	tmp[1] = ((u64) in[0]) * (in[1] << 1);
				485	tmp[2] = ((u64) in[0]) * (in[2] << 1) +
				486	((u64) in[1]) * (in[1] << 1);
				487	tmp[3] = ((u64) in[0]) * (in[3] << 1) +
				488	((u64) in[1]) * (in[2] << 1);
				489	tmp[4] = ((u64) in[0]) * (in[4] << 1) +
				490	((u64) in[1]) * (in[3] << 2) + ((u64) in[2]) * in[2];
				491	tmp[5] = ((u64) in[0]) * (in[5] << 1) + ((u64) in[1]) *
				492	(in[4] << 1) + ((u64) in[2]) * (in[3] << 1);
				493	tmp[6] = ((u64) in[0]) * (in[6] << 1) + ((u64) in[1]) *
				494	(in[5] << 2) + ((u64) in[2]) * (in[4] << 1) +
				495	((u64) in[3]) * (in[3] << 1);
				496	tmp[7] = ((u64) in[0]) * (in[7] << 1) + ((u64) in[1]) *
				497	(in[6] << 1) + ((u64) in[2]) * (in[5] << 1) +
				498	((u64) in[3]) * (in[4] << 1);
				499	/* tmp[8] has the greatest value of 261 + 260 + 261 + 260 + 2**60,
				500	* which is < 2*64 as required. /
				501	tmp[8] = ((u64) in[0]) * (in[8] << 1) + ((u64) in[1]) *
				502	(in[7] << 2) + ((u64) in[2]) * (in[6] << 1) +
				503	((u64) in[3]) * (in[5] << 2) + ((u64) in[4]) * in[4];
				504	tmp[9] = ((u64) in[1]) * (in[8] << 1) + ((u64) in[2]) *
				505	(in[7] << 1) + ((u64) in[3]) * (in[6] << 1) +
				506	((u64) in[4]) * (in[5] << 1);
				507	tmp[10] = ((u64) in[2]) * (in[8] << 1) + ((u64) in[3]) *
				508	(in[7] << 2) + ((u64) in[4]) * (in[6] << 1) +
				509	((u64) in[5]) * (in[5] << 1);
				510	tmp[11] = ((u64) in[3]) * (in[8] << 1) + ((u64) in[4]) *
				511	(in[7] << 1) + ((u64) in[5]) * (in[6] << 1);
				512	tmp[12] = ((u64) in[4]) * (in[8] << 1) +
				513	((u64) in[5]) * (in[7] << 2) + ((u64) in[6]) * in[6];
				514	tmp[13] = ((u64) in[5]) * (in[8] << 1) +
				515	((u64) in[6]) * (in[7] << 1);
				516	tmp[14] = ((u64) in[6]) * (in[8] << 1) +
				517	((u64) in[7]) * (in[7] << 1);
				518	tmp[15] = ((u64) in[7]) * (in[8] << 1);
				519	tmp[16] = ((u64) in[8]) * in[8];
				520
				521	felem_reduce_degree(out, tmp);
				522	}
				523
				524	/* felem_mul sets out=in*in2.
				525	*
				526	* On entry: in[0,2,...] < 230, in[1,3,...] < 229 and
				527	* in2[0,2,...] < 230, in2[1,3,...] < 229.
				528	* On exit: out[0,2,...] < 230, out[1,3,...] < 229. */
				529	static void felem_mul(felem out, const felem in, const felem in2) {
				530	u64 tmp[17];
				531
				532	tmp[0] = ((u64) in[0]) * in2[0];
				533	tmp[1] = ((u64) in[0]) * (in2[1] << 0) +
				534	((u64) in[1]) * (in2[0] << 0);
				535	tmp[2] = ((u64) in[0]) * (in2[2] << 0) + ((u64) in[1]) *
				536	(in2[1] << 1) + ((u64) in[2]) * (in2[0] << 0);
				537	tmp[3] = ((u64) in[0]) * (in2[3] << 0) + ((u64) in[1]) *
				538	(in2[2] << 0) + ((u64) in[2]) * (in2[1] << 0) +
				539	((u64) in[3]) * (in2[0] << 0);
				540	tmp[4] = ((u64) in[0]) * (in2[4] << 0) + ((u64) in[1]) *
				541	(in2[3] << 1) + ((u64) in[2]) * (in2[2] << 0) +
				542	((u64) in[3]) * (in2[1] << 1) +
				543	((u64) in[4]) * (in2[0] << 0);
				544	tmp[5] = ((u64) in[0]) * (in2[5] << 0) + ((u64) in[1]) *
				545	(in2[4] << 0) + ((u64) in[2]) * (in2[3] << 0) +
				546	((u64) in[3]) * (in2[2] << 0) + ((u64) in[4]) *
				547	(in2[1] << 0) + ((u64) in[5]) * (in2[0] << 0);
				548	tmp[6] = ((u64) in[0]) * (in2[6] << 0) + ((u64) in[1]) *
				549	(in2[5] << 1) + ((u64) in[2]) * (in2[4] << 0) +
				550	((u64) in[3]) * (in2[3] << 1) + ((u64) in[4]) *
				551	(in2[2] << 0) + ((u64) in[5]) * (in2[1] << 1) +
				552	((u64) in[6]) * (in2[0] << 0);
				553	tmp[7] = ((u64) in[0]) * (in2[7] << 0) + ((u64) in[1]) *
				554	(in2[6] << 0) + ((u64) in[2]) * (in2[5] << 0) +
				555	((u64) in[3]) * (in2[4] << 0) + ((u64) in[4]) *
				556	(in2[3] << 0) + ((u64) in[5]) * (in2[2] << 0) +
				557	((u64) in[6]) * (in2[1] << 0) +
				558	((u64) in[7]) * (in2[0] << 0);
				559	/* tmp[8] has the greatest value but doesn't overflow. See logic in
				560	* felem_square. */
				561	tmp[8] = ((u64) in[0]) * (in2[8] << 0) + ((u64) in[1]) *
				562	(in2[7] << 1) + ((u64) in[2]) * (in2[6] << 0) +
				563	((u64) in[3]) * (in2[5] << 1) + ((u64) in[4]) *
				564	(in2[4] << 0) + ((u64) in[5]) * (in2[3] << 1) +
				565	((u64) in[6]) * (in2[2] << 0) + ((u64) in[7]) *
				566	(in2[1] << 1) + ((u64) in[8]) * (in2[0] << 0);
				567	tmp[9] = ((u64) in[1]) * (in2[8] << 0) + ((u64) in[2]) *
				568	(in2[7] << 0) + ((u64) in[3]) * (in2[6] << 0) +
				569	((u64) in[4]) * (in2[5] << 0) + ((u64) in[5]) *
				570	(in2[4] << 0) + ((u64) in[6]) * (in2[3] << 0) +
				571	((u64) in[7]) * (in2[2] << 0) +
				572	((u64) in[8]) * (in2[1] << 0);
				573	tmp[10] = ((u64) in[2]) * (in2[8] << 0) + ((u64) in[3]) *
				574	(in2[7] << 1) + ((u64) in[4]) * (in2[6] << 0) +
				575	((u64) in[5]) * (in2[5] << 1) + ((u64) in[6]) *
				576	(in2[4] << 0) + ((u64) in[7]) * (in2[3] << 1) +
				577	((u64) in[8]) * (in2[2] << 0);
				578	tmp[11] = ((u64) in[3]) * (in2[8] << 0) + ((u64) in[4]) *
				579	(in2[7] << 0) + ((u64) in[5]) * (in2[6] << 0) +
				580	((u64) in[6]) * (in2[5] << 0) + ((u64) in[7]) *
				581	(in2[4] << 0) + ((u64) in[8]) * (in2[3] << 0);
				582	tmp[12] = ((u64) in[4]) * (in2[8] << 0) + ((u64) in[5]) *
				583	(in2[7] << 1) + ((u64) in[6]) * (in2[6] << 0) +
				584	((u64) in[7]) * (in2[5] << 1) +
				585	((u64) in[8]) * (in2[4] << 0);
				586	tmp[13] = ((u64) in[5]) * (in2[8] << 0) + ((u64) in[6]) *
				587	(in2[7] << 0) + ((u64) in[7]) * (in2[6] << 0) +
				588	((u64) in[8]) * (in2[5] << 0);
				589	tmp[14] = ((u64) in[6]) * (in2[8] << 0) + ((u64) in[7]) *
				590	(in2[7] << 1) + ((u64) in[8]) * (in2[6] << 0);
				591	tmp[15] = ((u64) in[7]) * (in2[8] << 0) +
				592	((u64) in[8]) * (in2[7] << 0);
				593	tmp[16] = ((u64) in[8]) * (in2[8] << 0);
				594
				595	felem_reduce_degree(out, tmp);
				596	}
				597
				598	static void felem_assign(felem out, const felem in) {
				599	memcpy(out, in, sizeof(felem));
				600	}
				601
				602	/* felem_inv calculates \|out\| = \|in\|^{-1}
				603	*
				604	* Based on Fermat's Little Theorem:
				605	* a^p = a (mod p)
				606	* a^{p-1} = 1 (mod p)
				607	* a^{p-2} = a^{-1} (mod p)
				608	*/
				609	static void felem_inv(felem out, const felem in) {
				610	felem ftmp, ftmp2;
				611	/* each e_I will hold \|in\|^{2^I - 1} */
				612	felem e2, e4, e8, e16, e32, e64;
				613	unsigned i;
				614
				615	felem_square(ftmp, in); /* 2^1 */
				616	felem_mul(ftmp, in, ftmp); /* 2^2 - 2^0 */
				617	felem_assign(e2, ftmp);
				618	felem_square(ftmp, ftmp); /* 2^3 - 2^1 */
				619	felem_square(ftmp, ftmp); /* 2^4 - 2^2 */
				620	felem_mul(ftmp, ftmp, e2); /* 2^4 - 2^0 */
				621	felem_assign(e4, ftmp);
				622	felem_square(ftmp, ftmp); /* 2^5 - 2^1 */
				623	felem_square(ftmp, ftmp); /* 2^6 - 2^2 */
				624	felem_square(ftmp, ftmp); /* 2^7 - 2^3 */
				625	felem_square(ftmp, ftmp); /* 2^8 - 2^4 */
				626	felem_mul(ftmp, ftmp, e4); /* 2^8 - 2^0 */
				627	felem_assign(e8, ftmp);
				628	for (i = 0; i < 8; i++) {
				629	felem_square(ftmp, ftmp);
				630	} /* 2^16 - 2^8 */
				631	felem_mul(ftmp, ftmp, e8); /* 2^16 - 2^0 */
				632	felem_assign(e16, ftmp);
				633	for (i = 0; i < 16; i++) {
				634	felem_square(ftmp, ftmp);
				635	} /* 2^32 - 2^16 */
				636	felem_mul(ftmp, ftmp, e16); /* 2^32 - 2^0 */
				637	felem_assign(e32, ftmp);
				638	for (i = 0; i < 32; i++) {
				639	felem_square(ftmp, ftmp);
				640	} /* 2^64 - 2^32 */
				641	felem_assign(e64, ftmp);
				642	felem_mul(ftmp, ftmp, in); /* 2^64 - 2^32 + 2^0 */
				643	for (i = 0; i < 192; i++) {
				644	felem_square(ftmp, ftmp);
				645	} /* 2^256 - 2^224 + 2^192 */
				646
				647	felem_mul(ftmp2, e64, e32); /* 2^64 - 2^0 */
				648	for (i = 0; i < 16; i++) {
				649	felem_square(ftmp2, ftmp2);
				650	} /* 2^80 - 2^16 */
				651	felem_mul(ftmp2, ftmp2, e16); /* 2^80 - 2^0 */
				652	for (i = 0; i < 8; i++) {
				653	felem_square(ftmp2, ftmp2);
				654	} /* 2^88 - 2^8 */
				655	felem_mul(ftmp2, ftmp2, e8); /* 2^88 - 2^0 */
				656	for (i = 0; i < 4; i++) {
				657	felem_square(ftmp2, ftmp2);
				658	} /* 2^92 - 2^4 */
				659	felem_mul(ftmp2, ftmp2, e4); /* 2^92 - 2^0 */
				660	felem_square(ftmp2, ftmp2); /* 2^93 - 2^1 */
				661	felem_square(ftmp2, ftmp2); /* 2^94 - 2^2 */
				662	felem_mul(ftmp2, ftmp2, e2); /* 2^94 - 2^0 */
				663	felem_square(ftmp2, ftmp2); /* 2^95 - 2^1 */
				664	felem_square(ftmp2, ftmp2); /* 2^96 - 2^2 */
				665	felem_mul(ftmp2, ftmp2, in); /* 2^96 - 3 */
				666
				667	felem_mul(out, ftmp2, ftmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
				668	}
				669
				670	/* felem_scalar_3 sets out=3*out.
				671	*
				672	* On entry: out[0,2,...] < 230, out[1,3,...] < 229.
				673	* On exit: out[0,2,...] < 230, out[1,3,...] < 229. */
				674	static void felem_scalar_3(felem out) {
				675	limb carry = 0;
				676	unsigned i;
				677
				678	for (i = 0;; i++) {
				679	out[i] *= 3;
				680	out[i] += carry;
				681	carry = out[i] >> 29;
				682	out[i] &= kBottom29Bits;
				683
				684	i++;
				685	if (i == NLIMBS)
				686	break;
				687
				688	out[i] *= 3;
				689	out[i] += carry;
				690	carry = out[i] >> 28;
				691	out[i] &= kBottom28Bits;
				692	}
				693
				694	felem_reduce_carry(out, carry);
				695	}
				696
				697	/* felem_scalar_4 sets out=4*out.
				698	*
				699	* On entry: out[0,2,...] < 230, out[1,3,...] < 229.
				700	* On exit: out[0,2,...] < 230, out[1,3,...] < 229. */
				701	static void felem_scalar_4(felem out) {
				702	limb carry = 0, next_carry;
				703	unsigned i;
				704
				705	for (i = 0;; i++) {
				706	next_carry = out[i] >> 27;
				707	out[i] <<= 2;
				708	out[i] &= kBottom29Bits;
				709	out[i] += carry;
				710	carry = next_carry + (out[i] >> 29);
				711	out[i] &= kBottom29Bits;
				712
				713	i++;
				714	if (i == NLIMBS)
				715	break;
				716
				717	next_carry = out[i] >> 26;
				718	out[i] <<= 2;
				719	out[i] &= kBottom28Bits;
				720	out[i] += carry;
				721	carry = next_carry + (out[i] >> 28);
				722	out[i] &= kBottom28Bits;
				723	}
				724
				725	felem_reduce_carry(out, carry);
				726	}
				727
				728	/* felem_scalar_8 sets out=8*out.
				729	*
				730	* On entry: out[0,2,...] < 230, out[1,3,...] < 229.
				731	* On exit: out[0,2,...] < 230, out[1,3,...] < 229. */
				732	static void felem_scalar_8(felem out) {
				733	limb carry = 0, next_carry;
				734	unsigned i;
				735
				736	for (i = 0;; i++) {
				737	next_carry = out[i] >> 26;
				738	out[i] <<= 3;
				739	out[i] &= kBottom29Bits;
				740	out[i] += carry;
				741	carry = next_carry + (out[i] >> 29);
				742	out[i] &= kBottom29Bits;
				743
				744	i++;
				745	if (i == NLIMBS)
				746	break;
				747
				748	next_carry = out[i] >> 25;
				749	out[i] <<= 3;
				750	out[i] &= kBottom28Bits;
				751	out[i] += carry;
				752	carry = next_carry + (out[i] >> 28);
				753	out[i] &= kBottom28Bits;
				754	}
				755
				756	felem_reduce_carry(out, carry);
				757	}
				758
				759	/* felem_is_zero_vartime returns 1 iff \|in\| == 0. It takes a variable amount of
				760	* time depending on the value of \|in\|. */
				761	static char felem_is_zero_vartime(const felem in) {
				762	limb carry;
				763	int i;
				764	limb tmp[NLIMBS];
				765
				766	felem_assign(tmp, in);
				767
				768	/* First, reduce tmp to a minimal form. */
				769	do {
				770	carry = 0;
				771	for (i = 0;; i++) {
				772	tmp[i] += carry;
				773	carry = tmp[i] >> 29;
				774	tmp[i] &= kBottom29Bits;
				775
				776	i++;
				777	if (i == NLIMBS)
				778	break;
				779
				780	tmp[i] += carry;
				781	carry = tmp[i] >> 28;
				782	tmp[i] &= kBottom28Bits;
				783	}
				784
				785	felem_reduce_carry(tmp, carry);
				786	} while (carry);
				787
				788	/* tmp < 2*257, so the only possible zero values are 0, p and 2p. /
				789	return memcmp(tmp, kZero, sizeof(tmp)) == 0 \|\|
				790	memcmp(tmp, kP, sizeof(tmp)) == 0 \|\|
				791	memcmp(tmp, k2P, sizeof(tmp)) == 0;
				792	}
				793
				794
				795	/* Group operations:
				796	*
				797	* Elements of the elliptic curve group are represented in Jacobian
				798	* coordinates: (x, y, z). An affine point (x', y') is x'=x/z2, y'=y/z3 in
				799	* Jacobian form. */
				800
				801	/* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}.
				802	*
				803	* See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l */
				804	static void point_double(felem x_out, felem y_out, felem z_out, const felem x,
				805	const felem y, const felem z) {
				806	felem delta, gamma, alpha, beta, tmp, tmp2;
				807
				808	felem_square(delta, z);
				809	felem_square(gamma, y);
				810	felem_mul(beta, x, gamma);
				811
				812	felem_sum(tmp, x, delta);
				813	felem_diff(tmp2, x, delta);
				814	felem_mul(alpha, tmp, tmp2);
				815	felem_scalar_3(alpha);
				816
				817	felem_sum(tmp, y, z);
				818	felem_square(tmp, tmp);
				819	felem_diff(tmp, tmp, gamma);
				820	felem_diff(z_out, tmp, delta);
				821
				822	felem_scalar_4(beta);
				823	felem_square(x_out, alpha);
				824	felem_diff(x_out, x_out, beta);
				825	felem_diff(x_out, x_out, beta);
				826
				827	felem_diff(tmp, beta, x_out);
				828	felem_mul(tmp, alpha, tmp);
				829	felem_square(tmp2, gamma);
				830	felem_scalar_8(tmp2);
				831	felem_diff(y_out, tmp, tmp2);
				832	}
				833
				834	/* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}.
				835	* (i.e. the second point is affine.)
				836	*
				837	* See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
				838	*
				839	* Note that this function does not handle P+P, infinity+P nor P+infinity
				840	* correctly. */
				841	static void point_add_mixed(felem x_out, felem y_out, felem z_out,
				842	const felem x1, const felem y1, const felem z1,
				843	const felem x2, const felem y2) {
				844	felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp;
				845
				846	felem_square(z1z1, z1);
				847	felem_sum(tmp, z1, z1);
				848
				849	felem_mul(u2, x2, z1z1);
				850	felem_mul(z1z1z1, z1, z1z1);
				851	felem_mul(s2, y2, z1z1z1);
				852	felem_diff(h, u2, x1);
				853	felem_sum(i, h, h);
				854	felem_square(i, i);
				855	felem_mul(j, h, i);
				856	felem_diff(r, s2, y1);
				857	felem_sum(r, r, r);
				858	felem_mul(v, x1, i);
				859
				860	felem_mul(z_out, tmp, h);
				861	felem_square(rr, r);
				862	felem_diff(x_out, rr, j);
				863	felem_diff(x_out, x_out, v);
				864	felem_diff(x_out, x_out, v);
				865
				866	felem_diff(tmp, v, x_out);
				867	felem_mul(y_out, tmp, r);
				868	felem_mul(tmp, y1, j);
				869	felem_diff(y_out, y_out, tmp);
				870	felem_diff(y_out, y_out, tmp);
				871	}
				872
				873	/* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.
				874	*
				875	* See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
				876	*
				877	* Note that this function does not handle P+P, infinity+P nor P+infinity
				878	* correctly. */
				879	static void point_add(felem x_out, felem y_out, felem z_out, const felem x1,
				880	const felem y1, const felem z1, const felem x2,
				881	const felem y2, const felem z2) {
				882	felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
				883
				884	felem_square(z1z1, z1);
				885	felem_square(z2z2, z2);
				886	felem_mul(u1, x1, z2z2);
				887
				888	felem_sum(tmp, z1, z2);
				889	felem_square(tmp, tmp);
				890	felem_diff(tmp, tmp, z1z1);
				891	felem_diff(tmp, tmp, z2z2);
				892
				893	felem_mul(z2z2z2, z2, z2z2);
				894	felem_mul(s1, y1, z2z2z2);
				895
				896	felem_mul(u2, x2, z1z1);
				897	felem_mul(z1z1z1, z1, z1z1);
				898	felem_mul(s2, y2, z1z1z1);
				899	felem_diff(h, u2, u1);
				900	felem_sum(i, h, h);
				901	felem_square(i, i);
				902	felem_mul(j, h, i);
				903	felem_diff(r, s2, s1);
				904	felem_sum(r, r, r);
				905	felem_mul(v, u1, i);
				906
				907	felem_mul(z_out, tmp, h);
				908	felem_square(rr, r);
				909	felem_diff(x_out, rr, j);
				910	felem_diff(x_out, x_out, v);
				911	felem_diff(x_out, x_out, v);
				912
				913	felem_diff(tmp, v, x_out);
				914	felem_mul(y_out, tmp, r);
				915	felem_mul(tmp, s1, j);
				916	felem_diff(y_out, y_out, tmp);
				917	felem_diff(y_out, y_out, tmp);
				918	}
				919
				920	/* point_add_or_double_vartime sets {x_out,y_out,z_out} = {x1,y1,z1} +
				921	* {x2,y2,z2}.
				922	*
				923	* See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
				924	*
				925	* This function handles the case where {x1,y1,z1}={x2,y2,z2}. */
				926	static void point_add_or_double_vartime(
				927	felem x_out, felem y_out, felem z_out, const felem x1, const felem y1,
				928	const felem z1, const felem x2, const felem y2, const felem z2) {
				929	felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
				930	char x_equal, y_equal;
				931
				932	felem_square(z1z1, z1);
				933	felem_square(z2z2, z2);
				934	felem_mul(u1, x1, z2z2);
				935
				936	felem_sum(tmp, z1, z2);
				937	felem_square(tmp, tmp);
				938	felem_diff(tmp, tmp, z1z1);
				939	felem_diff(tmp, tmp, z2z2);
				940
				941	felem_mul(z2z2z2, z2, z2z2);
				942	felem_mul(s1, y1, z2z2z2);
				943
				944	felem_mul(u2, x2, z1z1);
				945	felem_mul(z1z1z1, z1, z1z1);
				946	felem_mul(s2, y2, z1z1z1);
				947	felem_diff(h, u2, u1);
				948	x_equal = felem_is_zero_vartime(h);
				949	felem_sum(i, h, h);
				950	felem_square(i, i);
				951	felem_mul(j, h, i);
				952	felem_diff(r, s2, s1);
				953	y_equal = felem_is_zero_vartime(r);
				954	if (x_equal && y_equal) {
				955	point_double(x_out, y_out, z_out, x1, y1, z1);
				956	return;
				957	}
				958	felem_sum(r, r, r);
				959	felem_mul(v, u1, i);
				960
				961	felem_mul(z_out, tmp, h);
				962	felem_square(rr, r);
				963	felem_diff(x_out, rr, j);
				964	felem_diff(x_out, x_out, v);
				965	felem_diff(x_out, x_out, v);
				966
				967	felem_diff(tmp, v, x_out);
				968	felem_mul(y_out, tmp, r);
				969	felem_mul(tmp, s1, j);
				970	felem_diff(y_out, y_out, tmp);
				971	felem_diff(y_out, y_out, tmp);
				972	}
				973
				974	/* copy_conditional sets out=in if mask = 0xffffffff in constant time.
				975	*
				976	* On entry: mask is either 0 or 0xffffffff. */
				977	static void copy_conditional(felem out, const felem in, limb mask) {
				978	int i;
				979
				980	for (i = 0; i < NLIMBS; i++) {
				981	const limb tmp = mask & (in[i] ^ out[i]);
				982	out[i] ^= tmp;
				983	}
				984	}
				985
				986	/* select_affine_point sets {out_x,out_y} to the index'th entry of table.
				987	* On entry: index < 16, table[0] must be zero. */
				988	static void select_affine_point(felem out_x, felem out_y, const limb* table,
				989	limb index) {
				990	limb i, j;
				991
				992	memset(out_x, 0, sizeof(felem));
				993	memset(out_y, 0, sizeof(felem));
				994
				995	for (i = 1; i < 16; i++) {
				996	limb mask = i ^ index;
				997	mask \|= mask >> 2;
				998	mask \|= mask >> 1;
				999	mask &= 1;
				1000	mask--;
				1001	for (j = 0; j < NLIMBS; j++, table++) {
				1002	out_x[j] \|= *table & mask;
				1003	}
				1004	for (j = 0; j < NLIMBS; j++, table++) {
				1005	out_y[j] \|= *table & mask;
				1006	}
				1007	}
				1008	}
				1009
				1010	/* select_jacobian_point sets {out_x,out_y,out_z} to the index'th entry of
				1011	* table. On entry: index < 16, table[0] must be zero. */
				1012	static void select_jacobian_point(felem out_x, felem out_y, felem out_z,
				1013	const limb* table, limb index) {
				1014	limb i, j;
				1015
				1016	memset(out_x, 0, sizeof(felem));
				1017	memset(out_y, 0, sizeof(felem));
				1018	memset(out_z, 0, sizeof(felem));
				1019
				1020	/* The implicit value at index 0 is all zero. We don't need to perform that
				1021	* iteration of the loop because we already set out_* to zero. */
				1022	table += 3 * NLIMBS;
				1023
				1024	// Hit all entries to obscure cache profiling.
				1025	for (i = 1; i < 16; i++) {
				1026	limb mask = i ^ index;
				1027	mask \|= mask >> 2;
				1028	mask \|= mask >> 1;
				1029	mask &= 1;
				1030	mask--;
				1031	for (j = 0; j < NLIMBS; j++, table++) {
				1032	out_x[j] \|= *table & mask;
				1033	}
				1034	for (j = 0; j < NLIMBS; j++, table++) {
				1035	out_y[j] \|= *table & mask;
				1036	}
				1037	for (j = 0; j < NLIMBS; j++, table++) {
				1038	out_z[j] \|= *table & mask;
				1039	}
				1040	}
				1041	}
				1042
				1043	/* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian
				1044	* number. Note that the value of scalar must be less than the order of the
				1045	* group. */
				1046	static void scalar_base_mult(felem nx, felem ny, felem nz,
				1047	const p256_int* scalar) {
				1048	int i, j;
				1049	limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask;
				1050	u32 table_offset;
				1051
				1052	felem px, py;
				1053	felem tx, ty, tz;
				1054
				1055	memset(nx, 0, sizeof(felem));
				1056	memset(ny, 0, sizeof(felem));
				1057	memset(nz, 0, sizeof(felem));
				1058
				1059	/* The loop adds bits at positions 0, 64, 128 and 192, followed by
				1060	* positions 32,96,160 and 224 and does this 32 times. */
				1061	for (i = 0; i < 32; i++) {
				1062	if (i) {
				1063	point_double(nx, ny, nz, nx, ny, nz);
				1064	}
				1065	table_offset = 0;
				1066	for (j = 0; j <= 32; j += 32) {
				1067	char bit0 = p256_get_bit(scalar, 31 - i + j);
				1068	char bit1 = p256_get_bit(scalar, 95 - i + j);
				1069	char bit2 = p256_get_bit(scalar, 159 - i + j);
				1070	char bit3 = p256_get_bit(scalar, 223 - i + j);
				1071	limb index = bit0 \| (bit1 << 1) \| (bit2 << 2) \| (bit3 << 3);
				1072
				1073	select_affine_point(px, py, kPrecomputed + table_offset, index);
				1074	table_offset += 30 * NLIMBS;
				1075
				1076	/* Since scalar is less than the order of the group, we know that
				1077	* {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle
				1078	* below. */
				1079	point_add_mixed(tx, ty, tz, nx, ny, nz, px, py);
				1080	/* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero
				1081	* (a.k.a. the point at infinity). We handle that situation by
				1082	* copying the point from the table. */
				1083	copy_conditional(nx, px, n_is_infinity_mask);
				1084	copy_conditional(ny, py, n_is_infinity_mask);
				1085	copy_conditional(nz, kOne, n_is_infinity_mask);
				1086
				1087	/* Equally, the result is also wrong if the point from the table is
				1088	* zero, which happens when the index is zero. We handle that by
				1089	* only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0. */
				1090	p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
				1091	mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
				1092	copy_conditional(nx, tx, mask);
				1093	copy_conditional(ny, ty, mask);
				1094	copy_conditional(nz, tz, mask);
				1095	/* If p was not zero, then n is now non-zero. */
				1096	n_is_infinity_mask &= ~p_is_noninfinite_mask;
				1097	}
				1098	}
				1099	}
				1100
				1101	/* point_to_affine converts a Jacobian point to an affine point. If the input
				1102	* is the point at infinity then it returns (0, 0) in constant time. */
				1103	static void point_to_affine(felem x_out, felem y_out, const felem nx,
				1104	const felem ny, const felem nz) {
				1105	felem z_inv, z_inv_sq;
				1106	felem_inv(z_inv, nz);
				1107	felem_square(z_inv_sq, z_inv);
				1108	felem_mul(x_out, nx, z_inv_sq);
				1109	felem_mul(z_inv, z_inv, z_inv_sq);
				1110	felem_mul(y_out, ny, z_inv);
				1111	}
				1112
				1113	/* scalar_base_mult sets {nx,ny,nz} = scalar{x,y}. /
				1114	static void scalar_mult(felem nx, felem ny, felem nz, const felem x,
				1115	const felem y, const p256_int* scalar) {
				1116	int i;
				1117	felem px, py, pz, tx, ty, tz;
				1118	felem precomp[16][3];
				1119	limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask;
				1120
				1121	/* We precompute 0,1,2,... times {x,y}. */
				1122	memset(precomp, 0, sizeof(felem) * 3);
				1123	memcpy(&precomp[1][0], x, sizeof(felem));
				1124	memcpy(&precomp[1][1], y, sizeof(felem));
				1125	memcpy(&precomp[1][2], kOne, sizeof(felem));
				1126
				1127	for (i = 2; i < 16; i += 2) {
				1128	point_double(precomp[i][0], precomp[i][1], precomp[i][2],
				1129	precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]);
				1130
				1131	point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2],
				1132	precomp[i][0], precomp[i][1], precomp[i][2], x, y);
				1133	}
				1134
				1135	memset(nx, 0, sizeof(felem));
				1136	memset(ny, 0, sizeof(felem));
				1137	memset(nz, 0, sizeof(felem));
				1138	n_is_infinity_mask = -1;
				1139
				1140	/* We add in a window of four bits each iteration and do this 64 times. */
				1141	for (i = 0; i < 256; i += 4) {
				1142	if (i) {
				1143	point_double(nx, ny, nz, nx, ny, nz);
				1144	point_double(nx, ny, nz, nx, ny, nz);
				1145	point_double(nx, ny, nz, nx, ny, nz);
				1146	point_double(nx, ny, nz, nx, ny, nz);
				1147	}
				1148
				1149	index = (p256_get_bit(scalar, 255 - i - 0) << 3) \|
				1150	(p256_get_bit(scalar, 255 - i - 1) << 2) \|
				1151	(p256_get_bit(scalar, 255 - i - 2) << 1) \|
				1152	p256_get_bit(scalar, 255 - i - 3);
				1153
				1154	/* See the comments in scalar_base_mult about handling infinities. */
				1155	select_jacobian_point(px, py, pz, precomp[0][0], index);
				1156	point_add(tx, ty, tz, nx, ny, nz, px, py, pz);
				1157	copy_conditional(nx, px, n_is_infinity_mask);
				1158	copy_conditional(ny, py, n_is_infinity_mask);
				1159	copy_conditional(nz, pz, n_is_infinity_mask);
				1160
				1161	p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
				1162	mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
				1163
				1164	copy_conditional(nx, tx, mask);
				1165	copy_conditional(ny, ty, mask);
				1166	copy_conditional(nz, tz, mask);
				1167	n_is_infinity_mask &= ~p_is_noninfinite_mask;
				1168	}
				1169	}
				1170
				1171	#define kRDigits {2, 0, 0, 0xfffffffe, 0xffffffff, 0xffffffff, 0xfffffffd, 1} // 2^257 mod p256.p
				1172
				1173	#define kRInvDigits {0x80000000, 1, 0xffffffff, 0, 0x80000001, 0xfffffffe, 1, 0x7fffffff} // 1 / 2^257 mod p256.p
				1174
				1175	static const p256_int kR = { kRDigits };
				1176	static const p256_int kRInv = { kRInvDigits };
				1177
				1178	/* to_montgomery sets out = Rin. /
				1179	static void to_montgomery(felem out, const p256_int* in) {
				1180	p256_int in_shifted;
				1181	int i;
				1182
				1183	p256_init(&in_shifted);
				1184	p256_modmul(&SECP256r1_p, in, 0, &kR, &in_shifted);
				1185
				1186	for (i = 0; i < NLIMBS; i++) {
				1187	if ((i & 1) == 0) {
				1188	out[i] = P256_DIGIT(&in_shifted, 0) & kBottom29Bits;
				1189	p256_shr(&in_shifted, 29, &in_shifted);
				1190	} else {
				1191	out[i] = P256_DIGIT(&in_shifted, 0) & kBottom28Bits;
				1192	p256_shr(&in_shifted, 28, &in_shifted);
				1193	}
				1194	}
				1195
				1196	p256_clear(&in_shifted);
				1197	}
				1198
				1199	/* from_montgomery sets out=in/R. */
				1200	static void from_montgomery(p256_int* out, const felem in) {
				1201	p256_int result, tmp;
				1202	int i, top;
				1203
				1204	p256_init(&result);
				1205	p256_init(&tmp);
				1206
				1207	p256_add_d(&tmp, in[NLIMBS - 1], &result);
				1208	for (i = NLIMBS - 2; i >= 0; i--) {
				1209	if ((i & 1) == 0) {
				1210	top = p256_shl(&result, 29, &tmp);
				1211	} else {
				1212	top = p256_shl(&result, 28, &tmp);
				1213	}
				1214	top \|= p256_add_d(&tmp, in[i], &result);
				1215	}
				1216
				1217	p256_modmul(&SECP256r1_p, &kRInv, top, &result, out);
				1218
				1219	p256_clear(&result);
				1220	p256_clear(&tmp);
				1221	}
				1222
				1223	/* p256_base_point_mul sets {out_x,out_y} = nG, where n is < the
				1224	* order of the group. */
				1225	void p256_base_point_mul(const p256_int* n, p256_int* out_x, p256_int* out_y) {
				1226	felem x, y, z;
				1227
				1228	scalar_base_mult(x, y, z, n);
				1229
				1230	{
				1231	felem x_affine, y_affine;
				1232
				1233	point_to_affine(x_affine, y_affine, x, y, z);
				1234	from_montgomery(out_x, x_affine);
				1235	from_montgomery(out_y, y_affine);
				1236	}
				1237	}
				1238
				1239	/* p256_points_mul_vartime sets {out_x,out_y} = n1G + n2{in_x,in_y}, where
				1240	* n1 and n2 are < the order of the group.
				1241	*
				1242	* As indicated by the name, this function operates in variable time. This
				1243	* is safe because it's used for signature validation which doesn't deal
				1244	* with secrets. */
				1245	void p256_points_mul_vartime(
				1246	const p256_int* n1, const p256_int* n2, const p256_int* in_x,
				1247	const p256_int* in_y, p256_int* out_x, p256_int* out_y) {
				1248	felem x1, y1, z1, x2, y2, z2, px, py;
				1249
				1250	/* If both scalars are zero, then the result is the point at infinity. */
				1251	if (p256_is_zero(n1) != 0 && p256_is_zero(n2) != 0) {
				1252	p256_clear(out_x);
				1253	p256_clear(out_y);
				1254	return;
				1255	}
				1256
				1257	to_montgomery(px, in_x);
				1258	to_montgomery(py, in_y);
				1259	scalar_base_mult(x1, y1, z1, n1);
				1260	scalar_mult(x2, y2, z2, px, py, n2);
				1261
				1262	if (p256_is_zero(n2) != 0) {
				1263	/* If n2 == 0, then {x2,y2,z2} is zero and the result is just
				1264	* {x1,y1,z1}. */
				1265	} else if (p256_is_zero(n1) != 0) {
				1266	/* If n1 == 0, then {x1,y1,z1} is zero and the result is just
				1267	* {x2,y2,z2}. */
				1268	memcpy(x1, x2, sizeof(x2));
				1269	memcpy(y1, y2, sizeof(y2));
				1270	memcpy(z1, z2, sizeof(z2));
				1271	} else {
				1272	/* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */
				1273	point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2);
				1274	}
				1275
				1276	point_to_affine(px, py, x1, y1, z1);
				1277	from_montgomery(out_x, px);
				1278	from_montgomery(out_y, py);
				1279	}