/***************************************************
 **  EMUTAG SDK EXAMPLE FOR MIKRON TAG EMULATION  **
 ***************************************************/

#include <avr/io.h>
#include <avr/pgmspace.h>
#include "main.h"
#include "macros.h"

// command codes: compared in user_proc() against the first received byte (rx_buf[0])
#define C_REQA		0x26
#define C_WUPA		0x52
#define C_SEL1		0x93
#define C_SEL2		0x95
#define C_HALT		0x50
#define C_READ		0x30
#define C_WRITE		0xA2
#define C_COMPAT_WRITE	0xA0

// response constants
// ATQA low/high bytes are sent in reply to REQA / WUPA, SAK in reply to a complete SELECT,
// R_CT is the cascade tag placed in front of UID0..UID2 for C_SEL1 and used in the BCC0 computation
#define R_ATQA_H	0x00
#define R_ATQA_L	0x44
#define R_SAK		0x00
#define R_CT		0x88

// 4-bit responses are left-shifted by 4 bits, as data is shifted out to the left, MSB first
#define R_ACK		0xA0
#define R_NAK		0x00

// states (values of the `state` register variable)
// NOTE: the numeric order matters, user_proc() advances READY1 -> READY2 -> ACTIVE with state++
#define S_IDLE		0
#define S_READY1	1
#define S_READY2	2
#define S_ACTIVE	3
#define S_COMPAT_WRITE	4

// block-locking bit positions within lock byte 0 (tested against lock_b0 in buf_save())
// BL_OTP freezes lock bit 3 of lock byte 0
// BL_94  freezes lock bits 4..7 of lock byte 0 and bits 0..1 of lock byte 1
// BL_FA  freezes lock bits 2..7 of lock byte 1
#define BL_OTP		0
#define BL_94		1
#define BL_FA		2

// configuration page addresses and page ranges
// NUM_PAGES = number of 4-byte pages held in mem_array[]
#define NUM_PAGES	20

// configuration byte addresses (byte offsets into mem_array[] of lock byte 0 and lock byte 1)
#define B_LOCK_L	10
#define B_LOCK_H	11

// Simulation of write delay to internal EEPROM: 3.8 ms fixed from request
// TIMER0 is counting with prescaler = 1024 with count reset after receiving incoming message
// bitrate = F_CPU / 128, TIMER0 prescaler = 1024, ratio = 8
// 3.8 ms / (9 bit char / bitrate) = 45 byte frames
// (45 frames * 9 - 3 (turnaround bit buffer compensation)) / 8 = 50 timer counts
// buf_save() busy-waits until TCNT0 reaches this value
#define WRITE_DELAY	50

// Global variables bound to fixed CPU registers.
// Register contents are preserved between function calls, except for shared registers.
register volatile uint8_t state    asm("r6"); // current tag state, one of the S_* values
register volatile uint8_t lock_b0  asm("r7"); // active copy of mem_array[B_LOCK_L], reloaded on accepted REQA / WUPA
register volatile uint8_t lock_b1  asm("r8"); // active copy of mem_array[B_LOCK_H], reloaded on accepted REQA / WUPA
register volatile uint8_t lock_sw  asm("r9"); // bit LOCK_SW_BIT = lock switch position sampled on accepted REQA / WUPA
// bit of lock_sw that holds the switch position: set = locked mode, clear = unlocked mode
#define LOCK_SW_BIT	7

// Bit mask lookup table (entry n == 1 << n), stored in program memory and read with pgm_read_byte() by write_perm().
// Objects placed in PROGMEM must be const: avr-gcc 4.6 and later reject a non-const progmem variable.
const uint8_t perm_tab[8]  PROGMEM = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};

uint8_t mem_array[NUM_PAGES * 4]; // emulated tag memory: NUM_PAGES pages of 4 bytes each, page p starts at offset p * 4

/* FUNCTIONS */

// WARNING! The next two macros send a reply AND then execute `return`, i.e. they leave the calling function.
// Use them only directly inside user_proc() to keep minimum stack depth.
// Each one is wrapped in do { } while(0) so that it expands to exactly one statement and stays correct
// when used as the body of an unbraced if / else.
#define reply_std_frame(ptr, len)  do { reply((ptr), 0, (len)); return; } while(0)
#define reply_status(status)  do { reply_status_noret(status); return; } while(0)

void reply_status_noret(uint8_t status) __attribute__((noinline));

/*  reply_status_noret() : Sends a status response (R_ACK / R_NAK) built in rx_buf[0].
 *                       : Any status other than R_ACK also drops the tag back to S_IDLE.
 *                       : Unlike the reply_status() macro this does not return from the caller.
 *
 *  status = response value, already left-shifted as described for R_ACK / R_NAK
 */
void reply_status_noret(uint8_t status) {
	if(status != R_ACK) {
		state = S_IDLE;
	}
	
	// the response is transmitted from the receive buffer
	rx_buf[0] = status;
	
	// set both TX option flags before replying (by their names: no CRC, no parity)
	ctrl_flags |= (1 << F_TX_CRC_OFF) | (1 << F_TX_PAR_OFF);
	
	reply(&rx_buf[0], 4, 1);
}

/*  user_init() : Called from main code once at boot time. Can be used to initialize memory.
 *              : This function is required to be present. Leave empty if not used.
 */
void user_init(void);

/*  user_pwr_cycle() : Called from main code at boot time and when carrier is lost for >100 usec. Can be used to reset tag state.
 *                   : This function is required to be present. Leave empty if not used.
 */
void user_pwr_cycle(void);

/*  user_frame_end() : Called from main code after optionally sending response frame.
 *                   : Can be used for operations longer than frame response timeout, such as writing Flash or EEPROM.
 *                   : This function is required to be present. Leave empty if not used. Keep as short as possible in locked mode.
 */
void user_frame_end(void);

/*  user_proc() : Called from main code when an incoming message has been received and stored in rx_buf[].
 *
 *  rx_bytes      = number of complete bytes stored in rx_buf[]
 *  rx_bits       = number of additional bits received if the last byte is incomplete
 *  rx_bits_total = rx_bytes * 9 + rx_bits, cached for faster comparisons
 *  __attribute__((OS_main)) tells the compiler not to save any registers on stack, as main code reloads everything
 */
void user_proc(uint8_t rx_bytes, uint8_t rx_bits, uint8_t rx_bits_total) __attribute__((OS_main));

/*  write_data() : Performs the common part of the WRITE and COMPATIBILITY_WRITE commands.
 *
 *  page = number of 4-byte block in memory which needs to be written with incoming data
 *  src  = offset in rx_buf[] to start transferring data from
 *  returns status code for reply: R_ACK after the data has been stored,
 *          R_NAK (nothing stored) if page 1 would receive R_CT as its first byte
 */
uint8_t write_data(uint8_t page, uint8_t src);

/*  write_perm() : Computes whether ACK or NAK should be returned after WRITE or COMPATIBILITY_WRITE commands.
 *               : The result depends on page, lock-bits, block-locking bits, and lock switch position.
 *
 *  page = number of 4-byte block in memory for which write permissions are to be checked
 *  returns 0 if NAK needs to be returned, 1 if ACK
 */
uint8_t write_perm(uint8_t page);

/*  buf_save() : Writes 4 bytes from rx_buf[] to mem_array[] in WRITE or COMPATIBILITY_WRITE commands.
 *             : Data is transferred according to rules defined in MIFARE Ultralight write access conditions.
 *             : E.g. page 3 (OTP) is OR'ed with incoming data in locked mode, and overwritten in unlocked mode.
 *             : Busy-waits until TCNT0 reaches WRITE_DELAY before returning.
 *
 *  page = number of 4-byte block in memory which needs to be written with incoming data
 *  src  = offset in rx_buf[] to start transferring data from
 */
void buf_save(uint8_t page, uint8_t src);

/*  buf_cmp() : Compares two buffers over a number of complete bytes plus a number of additional bits.
 *            : Used during anticollision to compare the partial UID received at rx_buf[2]
 *            : with the tag's own UID prepared at rx_buf[10].
 *
 *  src_ptr = first buffer to compare
 *  dst_ptr = second buffer to compare
 *  len     = number of complete bytes to compare
 *  bits    = number of additional low-order bits to compare in the byte following the complete bytes
 *  returns 0 if all compared bytes and bits match, 1 otherwise
 */
uint8_t buf_cmp(uint8_t *src_ptr, uint8_t *dst_ptr, uint8_t len, uint8_t bits) __attribute__((noinline));

/*  user_init() : Boot-time initialization of the emulated memory and of the cached lock bytes.
 *              : Finishes by running user_pwr_cycle() to reset the tag state.
 */
void user_init(void) {
	// scratch variables required by FILL_BUF()
	uint8_t *asm_dst;
	uint8_t asm_len;
	
	// page 0: byte 0 = 0x34, byte 3 = 0xBC (equals R_CT ^ 0x34 ^ 0x00 ^ 0x00, the BCC0 formula used in buf_save())
	mem_array[0x03] = 0xBC;
	mem_array[0x00] = 0x34;
	
	// page 2, byte 1 (the "internal byte" per buf_save())
	mem_array[0x09] = 0x0B;
	
	// presumably fills the 4 bytes of page 4 with 0xFF (FILL_BUF is defined outside this file)
	FILL_BUF(mem_array+16, 0xFF, 4);
	
	// no lock bits active until the first REQA / WUPA reloads them from mem_array[]
	lock_b1 = 0;
	lock_b0 = 0;
	
	user_pwr_cycle();
}

/*  user_pwr_cycle() : Resets the tag state machine to S_IDLE and clears the halt flag in ctrl_flags.
 */
void user_pwr_cycle(void) {
	state = S_IDLE;
	
	// with the halt flag cleared, user_proc() accepts REQA again
	ctrl_flags = ctrl_flags & ~(1 << F_ST_HALT);
}

/*  user_frame_end() : Required hook; this example has no post-frame work to do.
 */
void user_frame_end(void)
{
	/* intentionally empty */
}

/*  user_proc() : Handles one received frame held in rx_buf[] and sends the response, if any, via reply().
 *              : Drives the tag state machine: S_IDLE -> S_READY1 -> S_READY2 -> S_ACTIVE, plus S_COMPAT_WRITE
 *              : while waiting for the second frame of a COMPATIBILITY_WRITE.
 *              : NOTE: reply_std_frame() and reply_status() are macros that contain a `return`,
 *              :       so every use of them below also leaves this function.
 *
 *  rx_bytes      = number of complete bytes stored in rx_buf[]
 *  rx_bits       = number of additional bits received if the last byte is incomplete
 *  rx_bits_total = total number of received bits as cached by the caller, used for frame length checks
 */
void user_proc(uint8_t rx_bytes, uint8_t rx_bits, uint8_t rx_bits_total) {
	uint8_t *asm_dst, *asm_src, asm_len; // for macro
	
	uint8_t op, arg, i, old_state;
	static uint8_t compat_wr_addr; // page address remembered between the two frames of COMPATIBILITY_WRITE
	
	// frames with a receive error flag set are ignored without a response (presumably character / overflow errors)
	if(ctrl_flags & 1 << F_RX_CHR_ERR) return;
	if(ctrl_flags & 1 << F_RX_OVF_ERR) return;
	
	// COMPATIBILITY_WRITE command consists of 2 request/response frames, need to save as special state
	if(state == S_COMPAT_WRITE) {
		// the second frame is consumed whether valid or not: every early return below leaves the tag in S_ACTIVE
		state = S_ACTIVE; // TODO: determine using NFC Shell how state is handled with errors in 2nd part of COMPAT_WRITE command
		
		// second frame must be exactly 18 complete bytes (presumably 16 data bytes + 2 CRC bytes) without errors
		if(rx_bits) return;
		if(rx_bytes != 18) return;
		if(ctrl_flags & 1 << F_RX_PAR_ERR) return;
		if(crc_chk(rx_bytes)) return;
		
		// only the first 4 bytes of the frame are stored (buf_save() reads 4 bytes starting at the given offset)
		reply_status(write_data(compat_wr_addr, 0));
	}
	
	// other commands or first part of COMPATIBILITY_WRITE command
	else {
		op = rx_buf[0];
		
		// REQA / WUPA frames (exactly 7 bits received)
		if(rx_bits_total == 7) {
			if(op == C_REQA || op == C_WUPA) {
				// answered only from S_IDLE; REQA is additionally ignored while the halt flag is set
				if(state == S_IDLE && (op == C_WUPA || (~ctrl_flags & 1 << F_ST_HALT))) {
					// activate new lock config
					lock_b0 = mem_array[B_LOCK_L];
					lock_b1 = mem_array[B_LOCK_H];
					
					// strobe lock switch position:
					// copy bit LOCK_PIN of LOCK_PORT into bit LOCK_SW_BIT of lock_sw through the T flag, r24 is scratch
					asm volatile(
						"in	r24, %1		\n\t"
						"bst	r24, %2		\n\t"
						"bld	%0, %3		\n\t"
						: "=r" (lock_sw)
						: "I" (_SFR_IO_ADDR(LOCK_PORT)), "I" (LOCK_PIN),
						  "I" (LOCK_SW_BIT), "0" (lock_sw)
						: "r24"
					);
					//asm volatile("in %0, %1" : "=r" (lock_sw) : "I" (_SFR_IO_ADDR(LOCK_PORT)));
					
					// reply with the 2-byte ATQA, low byte first (F_TX_CRC_OFF presumably suppresses the CRC)
					state = S_READY1;
					rx_buf[0] = R_ATQA_L;
					rx_buf[1] = R_ATQA_H;
					ctrl_flags |= 1 << F_TX_CRC_OFF;
					reply_std_frame(rx_buf, 2);
				}
				// REQA / WUPA in any other situation resets the tag to S_IDLE;
				// execution then reaches the length check below, which returns without a response
				else state = S_IDLE;
			}
			else return;
		}
		
		// everything below needs at least 18 received bits
		if(rx_bits_total < 18) return;
		
		// other frames
		if(ctrl_flags & 1 << F_RX_PAR_ERR) return;
		
		arg = rx_buf[1]; // second byte: NVB for SELECT commands, page address for READ / WRITE commands
		
		// ANTICOLLISION / SELECT commands using anticollision frames
		if(op == C_SEL1 || op == C_SEL2) {
			// NVB 0x70 means a complete SELECT: verify the CRC and exclude its 2 bytes from the byte count
			if(arg == 0x70) {
				if(crc_chk(rx_bytes)) return;
				rx_bytes -= 2;
			}
			
			// Check NVB for valid range
			if(arg & 8) return;
			if(arg < 0x20 || arg > 0x70) return;
			
			// Check NVB against actual data length (high nibble = complete bytes, low nibble = additional bits)
			if((arg >> 4) != rx_bytes) return;
			if((arg & 0x0f) != rx_bits) return;
			
			// default to S_IDLE; the previous state is restored below only if the UID matches and the state allows it
			old_state = state;
			state = S_IDLE;
			
			// build the 5 reference bytes for this cascade level at rx_buf[10..14]
			if(op == C_SEL1) {
				rx_buf[10] = R_CT;
				COPY_BUF(rx_buf+11, mem_array+0, 4); // extract R_CT UID0 UID1 UID2 BCC0
			}
			else {
				COPY_BUF(rx_buf+10, mem_array+4, 5); // extract UID3 UID4 UID5 UID6 BCC1
			}
			
			// compare the received UID part (after the 2 header bytes) with the reference; stay silent on mismatch
			if(buf_cmp(rx_buf+2, rx_buf+10, rx_bytes-2, rx_bits)) return;
			
			if(old_state == S_READY2 || (old_state == S_READY1 && op == C_SEL1)) {
				state = old_state;
				if(arg == 0x70) {
					// complete SELECT: advance to the next state and reply with SAK
					state++;
					if(op == C_SEL1) rx_buf[0] = R_SAK | 0x04; // cascade bit
					else             rx_buf[0] = R_SAK;
					reply_std_frame(rx_buf, 1);
				}
				else {
					// anticollision: reply from the reference bytes, starting after the rx_bytes-2 complete bytes already matched
					// (the second argument of reply() is presumably the bit offset within the first byte sent)
					ctrl_flags |= 1 << F_TX_CRC_OFF;
					reply(rx_buf+8+rx_bytes, rx_bits, 7-rx_bytes); return;
				}
			}
		}
		
		// other commands using standard frames
		else {
			// standard frames consist of complete bytes and end with a CRC, which is excluded from the byte count
			if(rx_bits) return;
			if(crc_chk(rx_bytes)) return;
			rx_bytes -= 2;
			
			if(op == C_READ) {
				if(rx_bytes != 2) return;
				old_state = state;
				state = S_IDLE;
				
				// page 0 can be read in any state except S_IDLE, other valid pages only in S_ACTIVE
				if((!arg && old_state != S_IDLE) || (arg < NUM_PAGES && old_state == S_ACTIVE)) {
					state = S_ACTIVE;
					
					// copy 16 bytes starting at the requested page into rx_buf[],
					// wrapping around to the start of mem_array[] after the last page
					arg <<= 2; // page number -> byte offset
					i = 16;
					asm_src = mem_array + arg;
					asm_dst = rx_buf;
					while(i) {
						*asm_dst++ = *asm_src++;
						if(asm_src == mem_array + (uint8_t)(NUM_PAGES << 2)) asm_src = mem_array;
						asm volatile("dec %0" : "=r" (i) : "0" (i)); // speed up copying using assembler...
					}
					
					reply_std_frame(rx_buf, 16);
				}
				// out-of-range page while active: NAK (reply_status_noret() also resets the state to S_IDLE)
				else if(arg >= NUM_PAGES && old_state == S_ACTIVE) { reply_status(R_NAK); }
			}
			
			if(op == C_HALT) {
				if(rx_bytes != 2) return;
				if(arg) return;
				// the halt flag is set only when HALT arrives in S_ACTIVE
				if(state == S_ACTIVE) ctrl_flags |= 1 << F_ST_HALT;
				state = S_IDLE; // will return anyway from state check below
			}
			
			// all remaining commands are accepted only in S_ACTIVE; otherwise fall back to S_IDLE without a response
			if(state != S_ACTIVE) {
				state = S_IDLE;
				return;
			}
			
			if(op == C_WRITE || op == C_COMPAT_WRITE) {
				// expected length without CRC: WRITE = command + page + 4 data bytes, COMPATIBILITY_WRITE = command + page
				if(op == C_WRITE) i = 6;
				else              i = 2;
				
				if(rx_bytes != i) return;
				
				if(arg < NUM_PAGES && write_perm(arg)) {
					if(op == C_WRITE) {
						// data bytes start at rx_buf[2]
						reply_status(write_data(arg, 2));
					}
					else {
						// acknowledge now; the data arrives in the next frame and is handled at the top of this function
						state = S_COMPAT_WRITE;
						compat_wr_addr = arg;
						reply_status(R_ACK);
					}
				}
				else { reply_status(R_NAK); }
			}
		}
	}
}

/*  write_data() : Common part of WRITE and COMPATIBILITY_WRITE: validates and stores one page.
 *
 *  page = number of the 4-byte page to write
 *  src  = offset in rx_buf[] of the first data byte
 *  returns R_ACK after storing the data, R_NAK if the write is refused
 */
uint8_t write_data(uint8_t page, uint8_t src) {
	// prevent storing cascade tag in UID3 (first byte of page 1)
	// Level 3 (10-byte) UIDs not supported
	if(page == 1 && rx_buf[src] == R_CT) return R_NAK;
	
	buf_save(page, src);
	return R_ACK;
}

uint8_t write_perm(uint8_t page) {
	if(~lock_sw & 1 << LOCK_SW_BIT) return 1;
	
	if(page > 15) { return 1; }
	
	if(page < 3) {
		if(page < 2)  { return 0; }
		else          { return 1; }
	}
	else {
		if(page < 8)  { if(lock_b0 & pgm_read_byte(&(perm_tab[page])))     return 0; }
		else          { if(lock_b1 & pgm_read_byte(&(perm_tab[page & 7]))) return 0; }
	}
	
	return 1;
}

/*  buf_save() : Stores 4 incoming bytes from rx_buf[] into one page of mem_array[].
 *             : In locked mode (lock switch bit set) protected bytes are kept, OTP-style pages are OR'ed
 *             : and lock bits frozen by block-locking bits are restored from lock_b0 / lock_b1.
 *             : BCC0 / BCC1 are recomputed when page 0 / page 1 is written.
 *             : Returns only after TCNT0 has reached WRITE_DELAY.
 *
 *  page = number of the 4-byte page to write
 *  src  = offset in rx_buf[] of the first data byte
 */
void buf_save(uint8_t page, uint8_t src) {
	uint8_t *page_mem = mem_array + (uint8_t)(page << 2); // first byte of the target page
	uint8_t *in_ptr   = rx_buf + src;                     // first incoming data byte
	
	// the 4 bytes that will finally be stored
	uint8_t d0, d1, d2, d3;
	
	// force load registers using X pointer to keep Z pointer free for LDD instructions
	asm volatile(
		"ld	%0, X+		\n\t"
		"ld	%1, X+		\n\t"
		"ld	%2, X+		\n\t"
		"ld	%3, X+		\n\t"
		: "=r" (d0), "=r" (d1), "=r" (d2), "=r" (d3), "=x" (in_ptr)
		: "4"  (in_ptr)
	);
	
	// page 2, byte 0 is BCC1: never taken from incoming data
	if(page == 2) {
		d0 = page_mem[0];
	}
	
	if(lock_sw & 1 << LOCK_SW_BIT) {
		// page 2, byte 1 is the internal byte: keep it
		if(page == 2) {
			d1 = page_mem[1];
		}
		
		// page 19, byte 3 is reserved: keep it
		if(page == 19) {
			d3 = page_mem[3];
		}
		
		// pages 2, 3 and everything above 15 can only have bits added (logical OR)
		if(page > 15 || page == 2 || page == 3) {
			d0 |= page_mem[0];
			d1 |= page_mem[1];
			d2 |= page_mem[2];
			d3 |= page_mem[3];
		}
		
		// revert bits locked by block-locking bits
		if(page == 2) {
			if(lock_b0 & 1 << BL_OTP) {
				COPY_BIT(d2, lock_b0, 3);
			}
			
			if(lock_b0 & 1 << BL_94) {
				// upper nibble of lock byte 0 is forced to its active value
				d2 |= lock_b0 & 0xF0;
				d2 &= lock_b0 | 0x0F;
				COPY_BIT(d3, lock_b1, 0);
				COPY_BIT(d3, lock_b1, 1);
			}
			
			if(lock_b0 & 1 << BL_FA) {
				// bits 2..7 of lock byte 1 are forced to their active values
				d3 |= lock_b1 & 0xFC;
				d3 &= lock_b1 | 0x03;
			}
		}
	}
	
	// recompute BCC0 (page 0, byte 3) and BCC1 (byte following page 1)
	if(page == 0) {
		d3 = R_CT ^ d0 ^ d1 ^ d2;
	}
	if(page == 1) {
		page_mem[4] = d0 ^ d1 ^ d2 ^ d3;
	}
	
	// store
	page_mem[0] = d0;
	page_mem[1] = d1;
	page_mem[2] = d2;
	page_mem[3] = d3;
	
	// simulate the EEPROM write time
	while(TCNT0 < WRITE_DELAY) {
	}
}

/*  buf_cmp() : Compares two buffers over len complete bytes plus `bits` additional bits.
 *
 *  src_ptr = first buffer to compare
 *  dst_ptr = second buffer to compare
 *  len     = number of complete bytes to compare
 *  bits    = number of additional low-order bits to compare in the byte following the complete bytes
 *  returns 0 if everything compared is equal, 1 on the first difference
 *
 *  NOTE: the byte following the complete bytes is read from both buffers even when bits == 0
 *        (the mask is 0xff then, so its value cannot cause a mismatch).
 */
uint8_t buf_cmp(uint8_t *src_ptr, uint8_t *dst_ptr, uint8_t len, uint8_t bits) {
	uint8_t tmp1, tmp2, mask = 0xff;
	
	// speed up comparison using assembler...
	// post-increment loads through X and Z; len is counted down with an explicit dec
	while(len) {
		asm volatile("ld %0, X+" : "=r" (tmp1), "=x" (src_ptr) : "1" (src_ptr));
		asm volatile("ld %0, Z+" : "=r" (tmp2), "=z" (dst_ptr) : "1" (dst_ptr));
		if(tmp1 != tmp2) return 1;
		asm volatile("dec %0" : "=r" (len) : "0" (len));
	}
	
	// replace default implementation of shifting by multiple bits:
	// mask <<= bits;
	// assembler code replaces default loop of 4 instructions with loop of 3 instructions
	//
	// The loop counts `bits` down to zero, so `bits` is declared as an output tied to its own input.
	// GCC extended asm does not allow an input-only operand to be modified.
	asm volatile(
		"and	%1, %1	\n\t"	// test bits for zero
		"breq	.+6	\n\t"	// bits == 0: skip the 3-instruction loop
		"add	%0, %0	\n\t"	// mask <<= 1
		"dec	%1	\n\t"	// bits--
		"brne	.-6	\n\t"	// repeat from the add while bits != 0
		: "=r" (mask), "=r" (bits)
		: "0"  (mask), "1"  (bits)
	);
	
	// mask has ones in every bit position that must be ignored; OR-ing it in leaves only the low bits to decide
	if((*dst_ptr | mask) != (*src_ptr | mask)) return 1;
	
	return 0;
}

