/* noekeon_asm.S */
/*
    This file is part of the Crypto-avr-lib/microcrypt-lib.
    Copyright (C) 2008  Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
/* 
 * noekeon assembler implementation for avr
 * author: Daniel Otte
 * email:  daniel.otte@rub.de
 * license: GPLv3
 */

#include <avr/io.h>

/*
 * push_all: push r2..r17 followed by the Y pointer (r28, r29), in
 * ascending register order. pop_all restores them in reverse order.
 */
.macro push_all
	.irp regno, 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,28,29
	push r\regno
	.endr
.endm

/*
 * pop_all: pop r29, r28 and then r17 down to r2, i.e. the exact reverse
 * of the push order used by push_all, and finally clear r1.
 */
.macro pop_all
	.irp regno, 29,28,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2
	pop r\regno
	.endr
	clr r1
.endm

/*
 * push_all_func: subroutine form of push_all.
 * Pops the two return-address bytes of the calling rcall into Z
 * (r31:r30), pushes the registers listed in push_all and then resumes
 * the caller through ijmp, so the saved registers stay on the stack of
 * the caller. Clobbers Z. Exactly two return-address bytes are popped,
 * so a two-byte return address is assumed.
 */
push_all_func:
	pop r31
	pop r30
	push_all
	ijmp

/*
 * pop_all_func: subroutine form of pop_all.
 * Pops the two return-address bytes of the calling rcall into Z
 * (r31:r30), restores the registers listed in pop_all (which also clears
 * r1) and then resumes the caller through ijmp. Clobbers Z. Exactly two
 * return-address bytes are popped, so a two-byte return address is
 * assumed.
 */
pop_all_func:
    pop r31
    pop r30
    pop_all
    ijmp

/*
 * xchg a, b: swap the contents of registers a and b with three XORs,
 * without needing a scratch register. a and b must be different
 * registers: with a == b the first eor already zeroes the register.
 */
.macro xchg a b
	eor \a, \b
	eor \b, \a
	eor \a, \b
.endm

/*
 * op32 op, a, b: apply the two-operand instruction op to the four byte
 * symbols a_0..a_3 (destinations) and b_0..b_3 (sources), byte 0 first.
 */
.macro op32 op a b
	.irp byteno, 0,1,2,3
	\op \a\()_\byteno, \b\()_\byteno
	.endr
.endm


/*
 * op32_4t op, a,b,c,d, w,x,y,z: apply the two-operand instruction op to
 * four explicitly named operand pairs: (a,w), (b,x), (c,y), (d,z).
 * The first operand of each pair is the destination.
 */
.macro op32_4t op a b c d w x y z
	\op \a, \w
	\op \b, \x
	\op \c, \y
	\op \d, \z
.endm


/*
 * op32_prefix op, p, q, a,b,c,d, w,x,y,z: like op32_4t, but the operand
 * names are built by concatenation: destinations are p+a .. p+d and
 * sources are q+w .. q+z. For example
 *     op32_prefix mov, temp_, state0_, a,b,c,d, 0,1,2,3
 * expands to "mov temp_a, state0_0" ... "mov temp_d, state0_3".
 */
.macro op32_prefix op p q a b c d w x y z
	\op \p\()\a, \q\()\w
	\op \p\()\b, \q\()\x
	\op \p\()\c, \q\()\y
	\op \p\()\d, \q\()\z
.endm

; === bigendian_rotl32 ===
; this function rotates a 32bit bigendian word n bits to the left
;  param1: the 32-bit value
;	given in r25,r24,r23,r22 (r22 is most significant)
;  param2: the 8-bit parameter giving the number of bits to rotate
;	given in r20 (0 is executed as 256 single-bit steps, which
;	leaves the word unchanged)
;  return: the rotated 32-bit word
;   given in r25,r24,r23,r22
;  clobbers: r20 (0 on return), r1 (scratch, 0 on return), flags

bigendian_rotl32:
2:
	/*
	 * Carry = current top bit of the most significant byte. The byte is
	 * sampled afresh on every pass: a copy taken only once before the
	 * loop runs out of valid bits after eight passes and then feeds a
	 * wrong wrap-around bit into r25.
	 */
	mov r1, r22
	rol r1

	rol r25
	rol r24
	rol r23
	rol r22

	dec r20 /* dec leaves the carry flag untouched */
	brne 2b
bigendian_rotl32_exit:
	clr r1 /* r1 was used as scratch; hand it back as zero */
	ret
	
	
/******************************************************************************/

; === bigendian_rotr32 ===
; this function rotates a 32bit bigendian word n bits to the right
;  param1: the 32-bit value
;	given in r25,r24,r23,r22 (r22 is most significant)
;  param2: the 8-bit parameter giving the number of bits to rotate
;	given in r20 (0 is executed as 256 single-bit steps, which
;	leaves the word unchanged)
;  return: the rotated 32-bit word
;   given in r25,r24,r23,r22
;  clobbers: r20 (0 on return), r1 (scratch, 0 on return), flags

bigendian_rotr32:
2:
	/*
	 * Carry = current bottom bit of the least significant byte. The byte
	 * is sampled afresh on every pass: a copy taken only once before the
	 * loop runs out of valid bits after eight passes and then feeds a
	 * wrong wrap-around bit into r22.
	 */
	mov r1, r25
	ror r1

	ror r22
	ror r23
	ror r24
	ror r25
	dec r20 /* dec leaves the carry flag untouched */
	brne 2b
bigendian_rotr32_exit:
	clr r1 /* r1 was used as scratch; hand it back as zero */
	ret

/******************************************************************************/
/*
void theta(uint32_t* k, uint32_t* a){
	uint32_t temp;
	temp = a[0] ^ a[2]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8);
	a[1] ^= temp;
	a[3] ^= temp;
	
	a[0] ^= k[0];
	a[1] ^= k[1];
	a[2] ^= k[2];
	a[3] ^= k[3];

	temp = a[1] ^ a[3]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8);
	a[0] ^= temp;
	a[2] ^= temp;	
}
*/

/*
 * Round constant table: 16 bytes placed in the code stream and read with
 * lpm. noekeon_enc walks it forwards starting at round_const[0] and
 * noekeon_dec walks it backwards starting at round_const[14]. Both
 * functions supply the constants 0x80 and 0xD4 as immediates instead of
 * reading them from this table.
 */
round_const: .byte 0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, \
                   0x2F, 0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, \
                   0xD4

/*
 * Register map of the cipher state. The symbols are register NUMBERS:
 * byte j of state word a[i] lives in register r(2 + 4*i + j), so the
 * whole 16-byte state occupies r2..r17. Byte _0 is the first byte of the
 * word in memory and is handed to the rotate helpers as the most
 * significant byte.
 */
;-- a[0]
state0_0 =  2
state0_1 =  3
state0_2 =  4
state0_3 =  5
;-- a[1]
state1_0 =  6
state1_1 =  7
state1_2 =  8
state1_3 =  9
;-- a[2]
state2_0 = 10
state2_1 = 11
state2_2 = 12
state2_3 = 13
;-- a[3]
state3_0 = 14
state3_1 = 15
state3_2 = 16
state3_3 = 17

; === theta ===
;
;  Linear mixing step of the cipher, including the key addition.
;
;  param1: the state in r2-r17 (updated in place)
;  param2: pointer to k in X (r26,r27); X is restored before returning
;
;  clobbers: r0, r18-r21, Y (r28,r29), flags; r1 is used as scratch and
;            is zero again on return
;
/* scratch registers holding the four bytes of "temp" */
temp_a = 18
temp_b = 19
temp_c = 20
temp_d = 21

theta:
	/* temp = a[0] ^ a[2] */
	mov temp_a, state0_0
	mov temp_b, state0_1
	mov temp_c, state0_2
	mov temp_d, state0_3
	eor temp_a, state2_0
	eor temp_b, state2_1
	eor temp_c, state2_2
	eor temp_d, state2_3

	/*
	 * temp ^= temp>>>8 ^ temp<<<8
	 * XORing every byte with the XOR of all four bytes (kept in r1)
	 * yields the bytes of that result, but stored two positions off.
	 */
	mov r1, temp_a
	eor r1, temp_b
	eor r1, temp_c
	eor r1, temp_d

	eor temp_a, r1
	eor temp_b, r1
	eor temp_c, r1
	eor temp_d, r1

	/* temp is now stored in the order c,d,a,b (abcd being normal order) */
	/* a[1] ^= temp */
	op32_4t eor, state1_0, state1_1, state1_2, state1_3, temp_c, temp_d, temp_a, temp_b
	/* a[3] ^= temp */
	op32_4t eor, state3_0, state3_1, state3_2, state3_3, temp_c, temp_d, temp_a, temp_b

	/*
	 * state ^= k (X points to k)
	 * The state registers are addressed as data memory through Y, which
	 * relies on r2 being reachable at data address 2.
	 */
	ldi r28, 2
	clr r29 /* Y points to r2 aka state0_0 */
	ldi temp_a, 16 /* byte counter */
1:
	ld r1, X+
	ld r0, Y
	eor r1, r0
	st Y+, r1
	dec temp_a
	brne 1b
	sbiw r26, 16 /* set X back to key */

	/* temp = a[1] ^ a[3]; temp ^= temp>>>8 ^ temp<<<8 */
	op32_prefix mov, temp_, state1_, a,b,c,d, 0,1,2,3
	op32_prefix eor, temp_, state3_, a,b,c,d, 0,1,2,3

	mov r1, temp_a
	eor r1, temp_b
	eor r1, temp_c
	eor r1, temp_d

	op32_prefix eor, temp_, r, a,b,c,d, 1,1,1,1

	/* temp is now stored in the order c,d,a,b (abcd being normal order) */
	/* a[0] ^= temp */
	op32_4t eor, state0_0, state0_1, state0_2, state0_3, temp_c, temp_d, temp_a, temp_b
	/* a[2] ^= temp */
	op32_4t eor, state2_0, state2_1, state2_2, state2_3, temp_c, temp_d, temp_a, temp_b

	clr r1
	ret

/******************************************************************************/
#ifndef NOEKEON_NO_ENC
; === noekeon_enc ===
;
;  Encrypts the 16-byte buffer in place: the buffer is loaded into the
;  state registers r2-r17, run through 16 calls of round plus a final
;  theta, and written back to the same address.
;
;  param1: pointer to buffer (r24,r25)
;  param2: pointer to k (r22,r23) 
;
.global noekeon_enc
noekeon_enc:
	rcall push_all_func
	/* load state */
	movw r26, r22 /* X = key pointer, as expected by theta */
	ldi r28, 2
	clr r29	/* Y points at r2 aka state0_0 */
	movw r30, r24 /* Z points at state */
	push r30 /* keep the buffer pointer for the write-back */
	push r31
	ldi r22, 16
	push r22 /* 16 is also the number of rounds and gets pushed here */
1:	
	ld r0, Z+
	st Y+, r0
	dec r22
	brne 1b
	/* state loaded */
	/* round pops the constant pushed last first and applies it before
	   theta; the other one is applied after theta */
	push r1 /* push round constant2 (0x00); r1 is expected to be zero here */
	ldi r20, 0x80
	push r20 /* push round constant1 (0x80) */
	rjmp 3f
2:
	/* r22 = rounds left (15..1): fetch round_const[15 - r22] */
	ldi r30, lo8(round_const+15)
	ldi r31, hi8(round_const+15)
	sub r30, r22
	sbci r31, 0
	clr r1
	push r1 /* round constant2 (0x00) */
	lpm r0, Z
	push r0 /* round constant1 (table entry) */
3:
	rcall round /* pops rc2 & rc1 */
	pop r22 /* round counter lives on the stack across the call */
	dec r22
	push r22
	brne 2b

	pop r22 /* drop the exhausted round counter */

	/* final step: XOR 0xD4 into the last byte of a[0], then theta */
	ldi r22, 0xD4
	eor state0_3, r22
	rcall theta

	/* write the state registers back to the buffer */
	pop r31
	pop r30
	clr r29
	ldi r28, 2
	ldi r22, 16
1:	
	ld r0, Y+
	st Z+, r0
	dec r22
	brne 1b
	
	rcall pop_all_func
	ret
#endif
/******************************************************************************/
/******************************************************************************/
#ifndef NOEKEON_NO_DEC

; === noekeon_dec ===
;
;  Decrypts the 16-byte buffer in place. A working key is first derived
;  in a 16-byte stack buffer by running theta over the key with an
;  all-zero key; the rounds then use that working key.
;
;  param1: pointer to buffer/state (r24,r25)
;  param2: pointer to k  (r22,r23) 
;
.global noekeon_dec
noekeon_dec:
	rcall push_all_func
	/*
	 * allocate 16 bytes on the stack
	 * SPH and SPL are written with interrupts masked (I flag restored
	 * from the saved SREG), so an interrupt never runs on a half-updated
	 * stack pointer.
	 */
	in r30, _SFR_IO_ADDR(SPL)
	in r31, _SFR_IO_ADDR(SPH)
	sbiw r30, 16 
	in r0, _SFR_IO_ADDR(SREG)
	cli
	out  _SFR_IO_ADDR(SPH), r31
	out  _SFR_IO_ADDR(SREG), r0
	out  _SFR_IO_ADDR(SPL), r30

	adiw r30, 1 /* Z = first byte of the new 16-byte stack buffer */
	/* push state pointer */
	push r24
	push r25
	movw r26, r22 /* move key ptr to X */

	/* set stackkey to zero (r1 is expected to be zero here) */
	ldi r22, 16
1:	st Z+, r1
	dec r22
	brne 1b
		
	/* copy key to state */
	clr r29
	ldi r28, 2
	ldi r22, 16
1:  ld r0, X+
	st Y+, r0
	dec r22
	brne 1b	

	movw r26, r30
	sbiw r26, 16 /* set X back to beginning of stack key */
	rcall theta /* state = theta(zero key, key) */
	
	/* mov state to stackkey */
	clr r29
	ldi r28,  2
	ldi r22, 16
1:	ld r0, Y+
	st X+, r0	
	dec r22
	brne 1b
	sbiw r26, 16 /* set X back to beginning of stack key */
	
	/* move data from stateptr to state */
	pop r31
	pop r30
	push r30 /* keep the state pointer on the stack for the write-back */
	push r31
	clr r29
	ldi r28,  2
	ldi r22, 16
	push r22 /* 16 is also the number of rounds and gets pushed here */
1:	ld r0, Z+
	st Y+, r0
	dec r22
	brne 1b	
	
;--- snip 8< ----
	
	/* round pops the constant pushed last first and applies it before
	   theta; the other one is applied after theta */
	ldi r20, 0xD4
	push r20 /* push round constant2 (0xD4) */
	push r22 /* push round constant1 (0x00, r22 is zero after the loop) */
	rjmp 3f
2:
	/* r22 = rounds left (15..1): fetch round_const[r22 - 1] */
	ldi r30, lo8(round_const-1)
	ldi r31, hi8(round_const-1)
	clr r1
	add r30, r22
	adc r31, r1
	lpm r0, Z
	push r0 /* round constant2 (table entry) */
	push r1 /* round constant1 (0x00) */
3:
	rcall round /* pops rc2 & rc1 */
	pop r22 /* round counter lives on the stack across the call */
	dec r22
	push r22
	brne 2b
;----
	pop r22 /* drop the exhausted round counter */

	/* final step: theta, then XOR 0x80 into the last byte of a[0] */
	rcall theta
	ldi r22, 0x80
	eor state0_3, r22
	
write_state_back:	
	/* write state back */
	pop r31 /* pop state pointer */
	pop r30
	clr r29
	ldi r28, 2
	ldi r22, 16
1:	
	ld r0, Y+
	st Z+, r0
	dec r22
	brne 1b
	
	/* remove key from stack (same interrupt-safe update as above) */
	in r30, _SFR_IO_ADDR(SPL)
	in r31, _SFR_IO_ADDR(SPH)
	adiw r30, 16 
	in r0, _SFR_IO_ADDR(SREG)
	cli
	out  _SFR_IO_ADDR(SPH), r31
	out  _SFR_IO_ADDR(SREG), r0
	out  _SFR_IO_ADDR(SPL), r30
	rcall pop_all_func
	ret
#endif
/******************************************************************************/
	
	
/*
 * round: one cipher round on the state in r2-r17.
 *
 * Stack on entry (top first): two return-address bytes, round constant 1,
 * round constant 2. Both constants are consumed here, so the caller must
 * not pop them. X must point to the key (it is passed on to theta).
 *
 * Sequence: state0_3 ^= rc1; theta; state0_3 ^= rc2; pi with left
 * rotation; gamma; pi with right rotation.
 *
 * Clobbers r0, r18-r25, Y, Z and flags; r1 is zero on return. Exactly
 * two return-address bytes are moved aside, so a two-byte return
 * address is assumed.
 */
round:	
	pop r24 /* park the return address in r24/r25 to reach the constants */
	pop r25
	pop r1 /* round constant 1 */
	eor state0_3, r1
	rcall theta /* leaves r24/r25 untouched */
	pop r1 /* round constant 2 */
	eor state0_3, r1
	push r25 /* put the return address back */
	push r24
pi_gamma_pi:
	/* Z selects the rotate routine that pi calls through icall */
	ldi r30, pm_lo8(bigendian_rotl32)
	ldi r31, pm_hi8(bigendian_rotl32)
	rcall pi
	/* pi1 done; now gamma */
	rcall gamma_1
	/* a[0] <-> a[3] */
	xchg state0_0, state3_0
	xchg state0_1, state3_1
	xchg state0_2, state3_2
	xchg state0_3, state3_3
	/* a[2] ^= a[0] ^ a[1] ^ a[3] */
	op32 eor, state2, state0
	op32 eor, state2, state1
	op32 eor, state2, state3

	rcall gamma_1
	/* pi2: same word/count pairs as pi1, rotating right instead */
	ldi r30, pm_lo8(bigendian_rotr32)
	ldi r31, pm_hi8(bigendian_rotr32)
	rcall pi
	ret
	
/*
 * gamma_1: the nonlinear half-step that round runs twice.
 *     a[1] ^= ~(a[3] | a[2])
 *     a[0] ^=   a[2] & a[1]
 * Works byte by byte on the state registers, using r1 as scratch.
 * r1 is NOT cleared on return.
 */
gamma_1:
	/* a[1] ^= ~(a[3]|a[2]) */
	.irp n, 0,1,2,3
	mov r1, state3_\n
	or  r1, state2_\n
	com r1
	eor state1_\n, r1
	.endr

	/* a[0] ^= a[2]&a[1] */
	.irp n, 0,1,2,3
	mov r1, state2_\n
	and r1, state1_\n
	eor state0_\n, r1
	.endr
	ret
	
/*
 * pi_rot_word word, count: copy the four bytes of a state word into
 * r22..r25, load the rotation count into r20, call the rotate routine
 * that Z points to, and copy the result back into the state word.
 */
.macro pi_rot_word word, count
	mov r22, \word\()_0
	mov r23, \word\()_1
	mov r24, \word\()_2
	mov r25, \word\()_3
	ldi r20, \count
	icall
	mov \word\()_0, r22
	mov \word\()_1, r23
	mov \word\()_2, r24
	mov \word\()_3, r25
.endm

/*
 * pi: rotate a[1] by 1, a[2] by 5 and a[3] by 2 bits. The direction is
 * chosen by the caller, which loads Z with the word address of
 * bigendian_rotl32 or bigendian_rotr32 before calling.
 * Clobbers r20 and r22-r25, plus whatever the rotate routine clobbers.
 */
pi:	
	pi_rot_word state1, 1
	pi_rot_word state2, 5
	pi_rot_word state3, 2
	ret

/******************************************************************************/
	
/*
void noekeon_init(void* key, noekeon_ctx_t* ctx){
	uint8_t nullv[16];
	
	memset(nullv, 0, 16);
	memcpy(ctx, key, 16);
	noekeon_enc(ctx, nullv);
}
*/

#ifndef NOEKEON_NO_INIT

.global noekeon_init
noekeon_init:
; === noekeon_init ===
;
;  Copies the 16-byte key into the context and then encrypts the context
;  in place with an all-zero key held in a temporary stack buffer.
;
;  param1: pointer to key (r24,r25)
;  param2: pointer to context  (r22,r23) 
;
	/*
	 * reserve 16 bytes of stack for the all-zero key
	 * SPH and SPL are written with interrupts masked (I flag restored
	 * from the saved SREG), so an interrupt never runs on a half-updated
	 * stack pointer.
	 */
	in r30, _SFR_IO_ADDR(SPL)
	in r31, _SFR_IO_ADDR(SPH)
	sbiw r30, 16 
	in r0, _SFR_IO_ADDR(SREG)
	cli
	out  _SFR_IO_ADDR(SPH), r31
	out  _SFR_IO_ADDR(SREG), r0
	out  _SFR_IO_ADDR(SPL), r30

	movw r26, r22 /* X = context */
	adiw r30, 1 /* Z = first byte of the stack buffer */
	movw r22, r30 /* the stack buffer becomes the key argument of noekeon_enc */
	/* set nullv(stack) to zero (r1 is expected to be zero here) */
	ldi r20, 16
1:	st Z+, r1
	dec r20
	brne 1b

	/* copy key data to ctx */
	movw r30, r24
	ldi r20, 16
1:	ld r1, Z+
	st X+, r1
	dec r20
	brne 1b
	clr r1 /* r1 was used as copy scratch; make it zero again */
	
	sbiw r26, 16 /* X back to the start of ctx */
	movw r24, r26 /* ctx becomes the buffer argument of noekeon_enc */
	rcall noekeon_enc
	
	/* release the stack buffer (same interrupt-safe update as above) */
	in r30, _SFR_IO_ADDR(SPL)
	in r31, _SFR_IO_ADDR(SPH)
	adiw r30, 16 
	in r0, _SFR_IO_ADDR(SREG)
	cli
	out  _SFR_IO_ADDR(SPH), r31
	out  _SFR_IO_ADDR(SREG), r0
	out  _SFR_IO_ADDR(SPL), r30	
	ret
	
#endif