Blame - crypto/curve25519-donna.c - chromium/src.git

blob: f141ac028b0f7bba419c218aad66275b507eb8c3 [file] [log] [blame]

[email protected]	b75d8e4	2013-03-08 23:40:42	[diff] [blame]	1	// Copyright (c) 2013 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	/*
				6	* curve25519-donna: Curve25519 elliptic curve, public key function
				7	*
				8	* http://code.google.com/p/curve25519-donna/
				9	*
				10	* Adam Langley <[email protected]>
				11	*
				12	* Derived from public domain C code by Daniel J. Bernstein <[email protected]>
				13	*
				14	* More information about curve25519 can be found here
				15	* http://cr.yp.to/ecdh.html
				16	*
				17	* djb's sample implementation of curve25519 is written in a special assembly
				18	* language called qhasm and uses the floating point registers.
				19	*
				20	* This is, almost, a clean room reimplementation from the curve25519 paper. It
				21	* uses many of the tricks described therein. Only the crecip function is taken
				22	* from the sample implementation.
				23	*/
				24
				25	#include <string.h>
				26	#include <stdint.h>
				27
				28	typedef uint8_t u8;
				29	typedef int32_t s32;
				30	typedef int64_t limb;
				31
				32	/* Field element representation:
				33	*
				34	* Field elements are written as an array of signed, 64-bit limbs, least
				35	* significant first. The value of the field element is:
				36	* x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
				37	*
				38	* i.e. the limbs are 26, 25, 26, 25, ... bits wide.
				39	*/
				40
				41	/* Sum two numbers: output += in */
				42	static void fsum(limb output, const limb in) {
				43	unsigned i;
				44	for (i = 0; i < 10; i += 2) {
				45	output[0+i] = (output[0+i] + in[0+i]);
				46	output[1+i] = (output[1+i] + in[1+i]);
				47	}
				48	}
				49
				50	/* Find the difference of two numbers: output = in - output
				51	* (note the order of the arguments!)
				52	*/
				53	static void fdifference(limb output, const limb in) {
				54	unsigned i;
				55	for (i = 0; i < 10; ++i) {
				56	output[i] = (in[i] - output[i]);
				57	}
				58	}
				59
				60	/* Multiply a number my a scalar: output = in * scalar */
				61	static void fscalar_product(limb output, const limb in, const limb scalar) {
				62	unsigned i;
				63	for (i = 0; i < 10; ++i) {
				64	output[i] = in[i] * scalar;
				65	}
				66	}
				67
				68	/* Multiply two numbers: output = in2 * in
				69	*
				70	* output must be distinct to both inputs. The inputs are reduced coefficient
				71	* form, the output is not.
				72	*/
				73	static void fproduct(limb output, const limb in2, const limb *in) {
				74	output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]);
				75	output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) +
				76	((limb) ((s32) in2[1])) * ((s32) in[0]);
				77	output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) +
				78	((limb) ((s32) in2[0])) * ((s32) in[2]) +
				79	((limb) ((s32) in2[2])) * ((s32) in[0]);
				80	output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) +
				81	((limb) ((s32) in2[2])) * ((s32) in[1]) +
				82	((limb) ((s32) in2[0])) * ((s32) in[3]) +
				83	((limb) ((s32) in2[3])) * ((s32) in[0]);
				84	output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) +
				85	2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) +
				86	((limb) ((s32) in2[3])) * ((s32) in[1])) +
				87	((limb) ((s32) in2[0])) * ((s32) in[4]) +
				88	((limb) ((s32) in2[4])) * ((s32) in[0]);
				89	output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) +
				90	((limb) ((s32) in2[3])) * ((s32) in[2]) +
				91	((limb) ((s32) in2[1])) * ((s32) in[4]) +
				92	((limb) ((s32) in2[4])) * ((s32) in[1]) +
				93	((limb) ((s32) in2[0])) * ((s32) in[5]) +
				94	((limb) ((s32) in2[5])) * ((s32) in[0]);
				95	output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) +
				96	((limb) ((s32) in2[1])) * ((s32) in[5]) +
				97	((limb) ((s32) in2[5])) * ((s32) in[1])) +
				98	((limb) ((s32) in2[2])) * ((s32) in[4]) +
				99	((limb) ((s32) in2[4])) * ((s32) in[2]) +
				100	((limb) ((s32) in2[0])) * ((s32) in[6]) +
				101	((limb) ((s32) in2[6])) * ((s32) in[0]);
				102	output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) +
				103	((limb) ((s32) in2[4])) * ((s32) in[3]) +
				104	((limb) ((s32) in2[2])) * ((s32) in[5]) +
				105	((limb) ((s32) in2[5])) * ((s32) in[2]) +
				106	((limb) ((s32) in2[1])) * ((s32) in[6]) +
				107	((limb) ((s32) in2[6])) * ((s32) in[1]) +
				108	((limb) ((s32) in2[0])) * ((s32) in[7]) +
				109	((limb) ((s32) in2[7])) * ((s32) in[0]);
				110	output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) +
				111	2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) +
				112	((limb) ((s32) in2[5])) * ((s32) in[3]) +
				113	((limb) ((s32) in2[1])) * ((s32) in[7]) +
				114	((limb) ((s32) in2[7])) * ((s32) in[1])) +
				115	((limb) ((s32) in2[2])) * ((s32) in[6]) +
				116	((limb) ((s32) in2[6])) * ((s32) in[2]) +
				117	((limb) ((s32) in2[0])) * ((s32) in[8]) +
				118	((limb) ((s32) in2[8])) * ((s32) in[0]);
				119	output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) +
				120	((limb) ((s32) in2[5])) * ((s32) in[4]) +
				121	((limb) ((s32) in2[3])) * ((s32) in[6]) +
				122	((limb) ((s32) in2[6])) * ((s32) in[3]) +
				123	((limb) ((s32) in2[2])) * ((s32) in[7]) +
				124	((limb) ((s32) in2[7])) * ((s32) in[2]) +
				125	((limb) ((s32) in2[1])) * ((s32) in[8]) +
				126	((limb) ((s32) in2[8])) * ((s32) in[1]) +
				127	((limb) ((s32) in2[0])) * ((s32) in[9]) +
				128	((limb) ((s32) in2[9])) * ((s32) in[0]);
				129	output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) +
				130	((limb) ((s32) in2[3])) * ((s32) in[7]) +
				131	((limb) ((s32) in2[7])) * ((s32) in[3]) +
				132	((limb) ((s32) in2[1])) * ((s32) in[9]) +
				133	((limb) ((s32) in2[9])) * ((s32) in[1])) +
				134	((limb) ((s32) in2[4])) * ((s32) in[6]) +
				135	((limb) ((s32) in2[6])) * ((s32) in[4]) +
				136	((limb) ((s32) in2[2])) * ((s32) in[8]) +
				137	((limb) ((s32) in2[8])) * ((s32) in[2]);
				138	output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) +
				139	((limb) ((s32) in2[6])) * ((s32) in[5]) +
				140	((limb) ((s32) in2[4])) * ((s32) in[7]) +
				141	((limb) ((s32) in2[7])) * ((s32) in[4]) +
				142	((limb) ((s32) in2[3])) * ((s32) in[8]) +
				143	((limb) ((s32) in2[8])) * ((s32) in[3]) +
				144	((limb) ((s32) in2[2])) * ((s32) in[9]) +
				145	((limb) ((s32) in2[9])) * ((s32) in[2]);
				146	output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) +
				147	2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) +
				148	((limb) ((s32) in2[7])) * ((s32) in[5]) +
				149	((limb) ((s32) in2[3])) * ((s32) in[9]) +
				150	((limb) ((s32) in2[9])) * ((s32) in[3])) +
				151	((limb) ((s32) in2[4])) * ((s32) in[8]) +
				152	((limb) ((s32) in2[8])) * ((s32) in[4]);
				153	output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) +
				154	((limb) ((s32) in2[7])) * ((s32) in[6]) +
				155	((limb) ((s32) in2[5])) * ((s32) in[8]) +
				156	((limb) ((s32) in2[8])) * ((s32) in[5]) +
				157	((limb) ((s32) in2[4])) * ((s32) in[9]) +
				158	((limb) ((s32) in2[9])) * ((s32) in[4]);
				159	output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) +
				160	((limb) ((s32) in2[5])) * ((s32) in[9]) +
				161	((limb) ((s32) in2[9])) * ((s32) in[5])) +
				162	((limb) ((s32) in2[6])) * ((s32) in[8]) +
				163	((limb) ((s32) in2[8])) * ((s32) in[6]);
				164	output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) +
				165	((limb) ((s32) in2[8])) * ((s32) in[7]) +
				166	((limb) ((s32) in2[6])) * ((s32) in[9]) +
				167	((limb) ((s32) in2[9])) * ((s32) in[6]);
				168	output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) +
				169	2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) +
				170	((limb) ((s32) in2[9])) * ((s32) in[7]));
				171	output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) +
				172	((limb) ((s32) in2[9])) * ((s32) in[8]);
				173	output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]);
				174	}
				175
				176	/* Reduce a long form to a short form by taking the input mod 2^255 - 19. */
				177	static void freduce_degree(limb *output) {
				178	/* Each of these shifts and adds ends up multiplying the value by 19. */
				179	output[8] += output[18] << 4;
				180	output[8] += output[18] << 1;
				181	output[8] += output[18];
				182	output[7] += output[17] << 4;
				183	output[7] += output[17] << 1;
				184	output[7] += output[17];
				185	output[6] += output[16] << 4;
				186	output[6] += output[16] << 1;
				187	output[6] += output[16];
				188	output[5] += output[15] << 4;
				189	output[5] += output[15] << 1;
				190	output[5] += output[15];
				191	output[4] += output[14] << 4;
				192	output[4] += output[14] << 1;
				193	output[4] += output[14];
				194	output[3] += output[13] << 4;
				195	output[3] += output[13] << 1;
				196	output[3] += output[13];
				197	output[2] += output[12] << 4;
				198	output[2] += output[12] << 1;
				199	output[2] += output[12];
				200	output[1] += output[11] << 4;
				201	output[1] += output[11] << 1;
				202	output[1] += output[11];
				203	output[0] += output[10] << 4;
				204	output[0] += output[10] << 1;
				205	output[0] += output[10];
				206	}
				207
				208	/* Reduce all coefficients of the short form input so that \|x\| < 2^26.
				209	*
				210	* On entry: \|output[i]\| < 2^62
				211	*/
				212	static void freduce_coefficients(limb *output) {
				213	unsigned i;
				214	do {
				215	output[10] = 0;
				216
				217	for (i = 0; i < 10; i += 2) {
				218	limb over = output[i] / 0x4000000l;
				219	output[i+1] += over;
				220	output[i] -= over * 0x4000000l;
				221
				222	over = output[i+1] / 0x2000000;
				223	output[i+2] += over;
				224	output[i+1] -= over * 0x2000000;
				225	}
				226	output[0] += 19 * output[10];
				227	} while (output[10]);
				228	}
				229
				230	/* A helpful wrapper around fproduct: output = in * in2.
				231	*
				232	* output must be distinct to both inputs. The output is reduced degree and
				233	* reduced coefficient.
				234	*/
				235	static void
				236	fmul(limb output, const limb in, const limb *in2) {
				237	limb t[19];
				238	fproduct(t, in, in2);
				239	freduce_degree(t);
				240	freduce_coefficients(t);
				241	memcpy(output, t, sizeof(limb) * 10);
				242	}
				243
				244	static void fsquare_inner(limb output, const limb in) {
				245	output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]);
				246	output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]);
				247	output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
				248	((limb) ((s32) in[0])) * ((s32) in[2]));
				249	output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) +
				250	((limb) ((s32) in[0])) * ((s32) in[3]));
				251	output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) +
				252	4 * ((limb) ((s32) in[1])) * ((s32) in[3]) +
				253	2 * ((limb) ((s32) in[0])) * ((s32) in[4]);
				254	output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) +
				255	((limb) ((s32) in[1])) * ((s32) in[4]) +
				256	((limb) ((s32) in[0])) * ((s32) in[5]));
				257	output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) +
				258	((limb) ((s32) in[2])) * ((s32) in[4]) +
				259	((limb) ((s32) in[0])) * ((s32) in[6]) +
				260	2 * ((limb) ((s32) in[1])) * ((s32) in[5]));
				261	output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) +
				262	((limb) ((s32) in[2])) * ((s32) in[5]) +
				263	((limb) ((s32) in[1])) * ((s32) in[6]) +
				264	((limb) ((s32) in[0])) * ((s32) in[7]));
				265	output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) +
				266	2 * (((limb) ((s32) in[2])) * ((s32) in[6]) +
				267	((limb) ((s32) in[0])) * ((s32) in[8]) +
				268	2 * (((limb) ((s32) in[1])) * ((s32) in[7]) +
				269	((limb) ((s32) in[3])) * ((s32) in[5])));
				270	output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) +
				271	((limb) ((s32) in[3])) * ((s32) in[6]) +
				272	((limb) ((s32) in[2])) * ((s32) in[7]) +
				273	((limb) ((s32) in[1])) * ((s32) in[8]) +
				274	((limb) ((s32) in[0])) * ((s32) in[9]));
				275	output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) +
				276	((limb) ((s32) in[4])) * ((s32) in[6]) +
				277	((limb) ((s32) in[2])) * ((s32) in[8]) +
				278	2 * (((limb) ((s32) in[3])) * ((s32) in[7]) +
				279	((limb) ((s32) in[1])) * ((s32) in[9])));
				280	output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) +
				281	((limb) ((s32) in[4])) * ((s32) in[7]) +
				282	((limb) ((s32) in[3])) * ((s32) in[8]) +
				283	((limb) ((s32) in[2])) * ((s32) in[9]));
				284	output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) +
				285	2 * (((limb) ((s32) in[4])) * ((s32) in[8]) +
				286	2 * (((limb) ((s32) in[5])) * ((s32) in[7]) +
				287	((limb) ((s32) in[3])) * ((s32) in[9])));
				288	output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) +
				289	((limb) ((s32) in[5])) * ((s32) in[8]) +
				290	((limb) ((s32) in[4])) * ((s32) in[9]));
				291	output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) +
				292	((limb) ((s32) in[6])) * ((s32) in[8]) +
				293	2 * ((limb) ((s32) in[5])) * ((s32) in[9]));
				294	output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) +
				295	((limb) ((s32) in[6])) * ((s32) in[9]));
				296	output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) +
				297	4 * ((limb) ((s32) in[7])) * ((s32) in[9]);
				298	output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]);
				299	output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]);
				300	}
				301
				302	static void
				303	fsquare(limb output, const limb in) {
				304	limb t[19];
				305	fsquare_inner(t, in);
				306	freduce_degree(t);
				307	freduce_coefficients(t);
				308	memcpy(output, t, sizeof(limb) * 10);
				309	}
				310
				311	/* Take a little-endian, 32-byte number and expand it into polynomial form */
				312	static void
				313	fexpand(limb output, const u8 input) {
				314	#define F(n,start,shift,mask) \
				315	output[n] = ((((limb) input[start + 0]) \| \
				316	((limb) input[start + 1]) << 8 \| \
				317	((limb) input[start + 2]) << 16 \| \
				318	((limb) input[start + 3]) << 24) >> shift) & mask;
				319	F(0, 0, 0, 0x3ffffff);
				320	F(1, 3, 2, 0x1ffffff);
				321	F(2, 6, 3, 0x3ffffff);
				322	F(3, 9, 5, 0x1ffffff);
				323	F(4, 12, 6, 0x3ffffff);
				324	F(5, 16, 0, 0x1ffffff);
				325	F(6, 19, 1, 0x3ffffff);
				326	F(7, 22, 3, 0x1ffffff);
				327	F(8, 25, 4, 0x3ffffff);
				328	F(9, 28, 6, 0x1ffffff);
				329	#undef F
				330	}
				331
				332	/* Take a fully reduced polynomial form number and contract it into a
				333	* little-endian, 32-byte array
				334	*/
				335	static void
				336	fcontract(u8 output, limb input) {
				337	int i;
				338
				339	do {
				340	for (i = 0; i < 9; ++i) {
				341	if ((i & 1) == 1) {
				342	while (input[i] < 0) {
				343	input[i] += 0x2000000;
				344	input[i + 1]--;
				345	}
				346	} else {
				347	while (input[i] < 0) {
				348	input[i] += 0x4000000;
				349	input[i + 1]--;
				350	}
				351	}
				352	}
				353	while (input[9] < 0) {
				354	input[9] += 0x2000000;
				355	input[0] -= 19;
				356	}
				357	} while (input[0] < 0);
				358
				359	input[1] <<= 2;
				360	input[2] <<= 3;
				361	input[3] <<= 5;
				362	input[4] <<= 6;
				363	input[6] <<= 1;
				364	input[7] <<= 3;
				365	input[8] <<= 4;
				366	input[9] <<= 6;
				367	#define F(i, s) \
				368	output[s+0] \|= input[i] & 0xff; \
				369	output[s+1] = (input[i] >> 8) & 0xff; \
				370	output[s+2] = (input[i] >> 16) & 0xff; \
				371	output[s+3] = (input[i] >> 24) & 0xff;
				372	output[0] = 0;
				373	output[16] = 0;
				374	F(0,0);
				375	F(1,3);
				376	F(2,6);
				377	F(3,9);
				378	F(4,12);
				379	F(5,16);
				380	F(6,19);
				381	F(7,22);
				382	F(8,25);
				383	F(9,28);
				384	#undef F
				385	}
				386
				387	/* Input: Q, Q', Q-Q'
				388	* Output: 2Q, Q+Q'
				389	*
				390	* x2 z3: long form
				391	* x3 z3: long form
				392	* x z: short form, destroyed
				393	* xprime zprime: short form, destroyed
				394	* qmqp: short form, preserved
				395	*/
				396	static void fmonty(limb x2, limb z2, /* output 2Q */
				397	limb x3, limb z3, /* output Q + Q' */
				398	limb x, limb z, /* input Q */
				399	limb xprime, limb zprime, /* input Q' */
				400	const limb qmqp / input Q - Q' */) {
				401	limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
				402	zzprime[19], zzzprime[19], xxxprime[19];
				403
				404	memcpy(origx, x, 10 * sizeof(limb));
				405	fsum(x, z);
				406	fdifference(z, origx); // does x - z
				407
				408	memcpy(origxprime, xprime, sizeof(limb) * 10);
				409	fsum(xprime, zprime);
				410	fdifference(zprime, origxprime);
				411	fproduct(xxprime, xprime, z);
				412	fproduct(zzprime, x, zprime);
				413	freduce_degree(xxprime);
				414	freduce_coefficients(xxprime);
				415	freduce_degree(zzprime);
				416	freduce_coefficients(zzprime);
				417	memcpy(origxprime, xxprime, sizeof(limb) * 10);
				418	fsum(xxprime, zzprime);
				419	fdifference(zzprime, origxprime);
				420	fsquare(xxxprime, xxprime);
				421	fsquare(zzzprime, zzprime);
				422	fproduct(zzprime, zzzprime, qmqp);
				423	freduce_degree(zzprime);
				424	freduce_coefficients(zzprime);
				425	memcpy(x3, xxxprime, sizeof(limb) * 10);
				426	memcpy(z3, zzprime, sizeof(limb) * 10);
				427
				428	fsquare(xx, x);
				429	fsquare(zz, z);
				430	fproduct(x2, xx, zz);
				431	freduce_degree(x2);
				432	freduce_coefficients(x2);
				433	fdifference(zz, xx); // does zz = xx - zz
				434	memset(zzz + 10, 0, sizeof(limb) * 9);
				435	fscalar_product(zzz, zz, 121665);
				436	freduce_degree(zzz);
				437	freduce_coefficients(zzz);
				438	fsum(zzz, xx);
				439	fproduct(z2, zz, zzz);
				440	freduce_degree(z2);
				441	freduce_coefficients(z2);
				442	}
				443
				444	/* Calculates nQ where Q is the x-coordinate of a point on the curve
				445	*
				446	* resultx/resultz: the x coordinate of the resulting curve point (short form)
				447	* n: a little endian, 32-byte number
				448	* q: a point of the curve (short form)
				449	*/
				450	static void
				451	cmult(limb resultx, limb resultz, const u8 n, const limb q) {
				452	limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
				453	limb nqpqx = a, nqpqz = b, nqx = c, nqz = d, *t;
				454	limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
				455	limb nqpqx2 = e, nqpqz2 = f, nqx2 = g, nqz2 = h;
				456
				457	unsigned i, j;
				458
				459	memcpy(nqpqx, q, sizeof(limb) * 10);
				460
				461	for (i = 0; i < 32; ++i) {
				462	u8 byte = n[31 - i];
				463	for (j = 0; j < 8; ++j) {
				464	if (byte & 0x80) {
				465	fmonty(nqpqx2, nqpqz2,
				466	nqx2, nqz2,
				467	nqpqx, nqpqz,
				468	nqx, nqz,
				469	q);
				470	} else {
				471	fmonty(nqx2, nqz2,
				472	nqpqx2, nqpqz2,
				473	nqx, nqz,
				474	nqpqx, nqpqz,
				475	q);
				476	}
				477
				478	t = nqx;
				479	nqx = nqx2;
				480	nqx2 = t;
				481	t = nqz;
				482	nqz = nqz2;
				483	nqz2 = t;
				484	t = nqpqx;
				485	nqpqx = nqpqx2;
				486	nqpqx2 = t;
				487	t = nqpqz;
				488	nqpqz = nqpqz2;
				489	nqpqz2 = t;
				490
				491	byte <<= 1;
				492	}
				493	}
				494
				495	memcpy(resultx, nqx, sizeof(limb) * 10);
				496	memcpy(resultz, nqz, sizeof(limb) * 10);
				497	}
				498
				499	// -----------------------------------------------------------------------------
				500	// Shamelessly copied from djb's code
				501	// -----------------------------------------------------------------------------
				502	static void
				503	crecip(limb out, const limb z) {
				504	limb z2[10];
				505	limb z9[10];
				506	limb z11[10];
				507	limb z2_5_0[10];
				508	limb z2_10_0[10];
				509	limb z2_20_0[10];
				510	limb z2_50_0[10];
				511	limb z2_100_0[10];
				512	limb t0[10];
				513	limb t1[10];
				514	int i;
				515
				516	/* 2 */ fsquare(z2,z);
				517	/* 4 */ fsquare(t1,z2);
				518	/* 8 */ fsquare(t0,t1);
				519	/* 9 */ fmul(z9,t0,z);
				520	/* 11 */ fmul(z11,z9,z2);
				521	/* 22 */ fsquare(t0,z11);
				522	/* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
				523
				524	/* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
				525	/* 2^7 - 2^2 */ fsquare(t1,t0);
				526	/* 2^8 - 2^3 */ fsquare(t0,t1);
				527	/* 2^9 - 2^4 */ fsquare(t1,t0);
				528	/* 2^10 - 2^5 */ fsquare(t0,t1);
				529	/* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
				530
				531	/* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
				532	/* 2^12 - 2^2 */ fsquare(t1,t0);
				533	/* 2^20 - 2^10 */
				534	for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
				535	/* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
				536
				537	/* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
				538	/* 2^22 - 2^2 */ fsquare(t1,t0);
				539	/* 2^40 - 2^20 */
				540	for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
				541	/* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
				542
				543	/* 2^41 - 2^1 */ fsquare(t1,t0);
				544	/* 2^42 - 2^2 */ fsquare(t0,t1);
				545	/* 2^50 - 2^10 */
				546	for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
				547	/* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
				548
				549	/* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
				550	/* 2^52 - 2^2 */ fsquare(t1,t0);
				551	/* 2^100 - 2^50 */
				552	for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
				553	/* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
				554
				555	/* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
				556	/* 2^102 - 2^2 */ fsquare(t0,t1);
				557	/* 2^200 - 2^100 */
				558	for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
				559	/* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
				560
				561	/* 2^201 - 2^1 */ fsquare(t0,t1);
				562	/* 2^202 - 2^2 */ fsquare(t1,t0);
				563	/* 2^250 - 2^50 */
				564	for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
				565	/* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
				566
				567	/* 2^251 - 2^1 */ fsquare(t1,t0);
				568	/* 2^252 - 2^2 */ fsquare(t0,t1);
				569	/* 2^253 - 2^3 */ fsquare(t1,t0);
				570	/* 2^254 - 2^4 */ fsquare(t0,t1);
				571	/* 2^255 - 2^5 */ fsquare(t1,t0);
				572	/* 2^255 - 21 */ fmul(out,t1,z11);
				573	}
				574
				575	int
				576	curve25519_donna(u8 mypublic, const u8 secret, const u8 *basepoint) {
				577	limb bp[10], x[10], z[10], zmone[10];
				578	uint8_t e[32];
				579	int i;
				580
				581	for (i = 0; i < 32; ++i) e[i] = secret[i];
				582	e[0] &= 248;
				583	e[31] &= 127;
				584	e[31] \|= 64;
				585
				586	fexpand(bp, basepoint);
				587	cmult(x, z, e, bp);
				588	crecip(zmone, z);
				589	fmul(z, x, zmone);
				590	fcontract(mypublic, z);
				591	return 0;
				592	}