%{
/*
 * Copyright (c) 2024 Sascha Wildner <swildner@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdbool.h>
#include <stddef.h>

#include <sys/param.h>

#include "cb2c.h"
#include "basic20_parse.h"

/*
 * Advance the scanner position by l lines and c columns, first saving the
 * current position in prev_line/prev_col.  c may be negative.
 */
#define	ADD_LINECOL(l, c) do {		\
	prev_line = line_number;	\
	prev_col = col_number;		\
	line_number += (l);		\
	col_number += (c);		\
} while (0)

/*
 * Count token t in tok_count[], advance the column by the length of the
 * matched text (yyleng) and return t from the scanner.  Note that this
 * macro contains a return statement and evaluates t twice.
 */
#define	RETURN_TOK(t) do {		\
	tok_count[(t)]++;		\
	ADD_LINECOL(0, yyleng);		\
	return (t);			\
} while (0)

/* Helpers private to the scanner; defined in the user code section below. */
static void		DATA_read_values(void);
static ptrdiff_t	ID_detect_kw(char *);
static void		ID_remove_ws(char *);

/*
 * Keywords searched for inside identifier-like text by ID_detect_kw().
 * "go" stands for both "goto" and "gosub".  The table is never modified,
 * so both the strings and the pointer array are const.
 */
static const char * const keywords[] = {
	"abs", "and", "asc", "atn", "chr$", "close", "clr", "cmd", "cont",
	"cos", "data", "def", "dim", "end", "exp", "fn", "for", "fre", "get",
	"go" /* ...to/...sub */, "if", "input", "int", "left$", "len", "let",
	"list", "load", "log", "mid$", "new", "next", "not", "on", "open",
	"or", "peek", "poke", "pos", "print", "read", "rem", "restore",
	"return", "right$", "rnd", "run", "save", "sgn", "sin", "spc", "sqr",
	"step", "stop", "str$", "sys", "tab", "tan", "then", "to", "usr",
	"val", "verify", "wait"
};

/*
 * Named control codes as they appear in braces inside string literals,
 * together with the byte each one is replaced by in STRING_convert().
 * The table is read-only, hence const.
 */
static const struct {
	const char *cc_str;	/* spelling in the source, braces included */
	unsigned char cc_code;	/* byte emitted in its place */
} pc_cc[] = {
	{ "{CTRL-A}",		0x01 },
	{ "{CTRL-B}",		0x02 },
	{ "{stop}",		0x03 },
	{ "{CTRL-D}",		0x04 },
	{ "{wht}",		0x05 },
	{ "{CTRL-F}",		0x06 },
	{ "{CTRL-G}",		0x07 },
	{ "{dish}",		0x08 },
	{ "{ensh}",		0x09 },
	{ "{CTRL-K}",		0x0b },
	{ "{CTRL-L}",		0x0c },
	{ "{swlc}",		0x0e },
	{ "{CTRL-O}",		0x0f },
	{ "{CTRL-P}",		0x10 },
	{ "{down}",		0x11 },
	{ "{rvon}",		0x12 },
	{ "{home}",		0x13 },
	{ "{del}",		0x14 },
	{ "{CTRL-U}",		0x15 },
	{ "{CTRL-V}",		0x16 },
	{ "{CTRL-W}",		0x17 },
	{ "{CTRL-X}",		0x18 },
	{ "{CTRL-Y}",		0x19 },
	{ "{CTRL-Z}",		0x1a },
	{ "{esc}",		0x1b },
	{ "{red}",		0x1c },
	{ "{rght}",		0x1d },
	{ "{grn}",		0x1e },
	{ "{blu}",		0x1f },
	{ "{orng}",		0x81 },
	{ "{f1}",		0x85 },
	{ "{f3}",		0x86 },
	{ "{f5}",		0x87 },
	{ "{f7}",		0x88 },
	{ "{f2}",		0x89 },
	{ "{f4}",		0x8a },
	{ "{f6}",		0x8b },
	{ "{f8}",		0x8c },
	{ "{sret}",		0x8d },
	{ "{swuc}",		0x8e },
	{ "{blk}",		0x90 },
	{ "{up}",		0x91 },
	{ "{rvof}",		0x92 },
	{ "{clr}",		0x93 },
	{ "{inst}",		0x94 },
	{ "{brn}",		0x95 },
	{ "{lred}",		0x96 },
	{ "{gry1}",		0x97 },
	{ "{gry2}",		0x98 },
	{ "{lgrn}",		0x99 },
	{ "{lblu}",		0x9a },
	{ "{gry3}",		0x9b },
	{ "{pur}",		0x9c },
	{ "{left}",		0x9d },
	{ "{yel}",		0x9e },
	{ "{cyn}",		0x9f },
	{ "{CBM-K}",		0xa1 },
	{ "{CBM-I}",		0xa2 },
	{ "{CBM-T}",		0xa3 },
	{ "{CBM-@}",		0xa4 },
	{ "{CBM-G}",		0xa5 },
	{ "{CBM-+}",		0xa6 },
	{ "{CBM-M}",		0xa7 },
	{ "{CBM-POUND}",	0xa8 },
	{ "{SHIFT-POUND}",	0xa9 },
	{ "{CBM-N}",		0xaa },
	{ "{CBM-Q}",		0xab },
	{ "{CBM-D}",		0xac },
	{ "{CBM-Z}",		0xad },
	{ "{CBM-S}",		0xae },
	{ "{CBM-P}",		0xaf },
	{ "{CBM-A}",		0xb0 },
	{ "{CBM-E}",		0xb1 },
	{ "{CBM-R}",		0xb2 },
	{ "{CBM-W}",		0xb3 },
	{ "{CBM-H}",		0xb4 },
	{ "{CBM-J}",		0xb5 },
	{ "{CBM-L}",		0xb6 },
	{ "{CBM-Y}",		0xb7 },
	{ "{CBM-U}",		0xb8 },
	{ "{CBM-O}",		0xb9 },
	{ "{SHIFT-@}",		0xba },
	{ "{CBM-F}",		0xbb },
	{ "{CBM-C}",		0xbc },
	{ "{CBM-X}",		0xbd },
	{ "{CBM-V}",		0xbe },
	{ "{CBM-B}",		0xbf },
	{ "{SHIFT-*}",		0xc0 },
	{ "{SHIFT-A}",		0xc1 },
	{ "{SHIFT-B}",		0xc2 },
	{ "{SHIFT-C}",		0xc3 },
	{ "{SHIFT-D}",		0xc4 },
	{ "{SHIFT-E}",		0xc5 },
	{ "{SHIFT-F}",		0xc6 },
	{ "{SHIFT-G}",		0xc7 },
	{ "{SHIFT-H}",		0xc8 },
	{ "{SHIFT-I}",		0xc9 },
	{ "{SHIFT-J}",		0xca },
	{ "{SHIFT-K}",		0xcb },
	{ "{SHIFT-L}",		0xcc },
	{ "{SHIFT-M}",		0xcd },
	{ "{SHIFT-N}",		0xce },
	{ "{SHIFT-O}",		0xcf },
	{ "{SHIFT-P}",		0xd0 },
	{ "{SHIFT-Q}",		0xd1 },
	{ "{SHIFT-R}",		0xd2 },
	{ "{SHIFT-S}",		0xd3 },
	{ "{SHIFT-T}",		0xd4 },
	{ "{SHIFT-U}",		0xd5 },
	{ "{SHIFT-V}",		0xd6 },
	{ "{SHIFT-W}",		0xd7 },
	{ "{SHIFT-X}",		0xd8 },
	{ "{SHIFT-Y}",		0xd9 },
	{ "{SHIFT-Z}",		0xda },
	{ "{SHIFT-+}",		0xdb },
	{ "{CBM--}",		0xdc },
	{ "{SHIFT--}",		0xdd },
	{ "{SHIFT-^}",		0xde },
	{ "{CBM-*}",		0xdf }
};

/* Set by the whole-line rule, cleared once a line number has been scanned. */
bool		start_of_line;
/*
 * data_index is the next free slot in data_values[]; line_number and
 * col_number are the running position maintained by ADD_LINECOL(), which
 * saves the previous position in prev_line and prev_col.
 */
int		data_index, line_number, col_number, prev_line, prev_col;
/* Number of times each token code has been returned, indexed by the code. */
int		tok_count[1000];
/* Value of the most recently scanned BASIC line number. */
unsigned	basic_lineno;
/*
 * data_values[] holds strdup'ed copies of the DATA values read so far;
 * line_buf is a strdup'ed copy of the text of the line being scanned.
 */
const char 	*data_values[DATA_VALS_SIZE], *line_buf;
%}

/* No yywrap() is needed: scanning stops at the end of the input. */
%option noyywrap

/* exp: 'e', an optional sign and (possibly zero) digits. */
/* int: digits with an optional exponent; blanks may be embedded. */
/* real: optional digits around a mandatory '.', optional exponent. */
digit		[0-9]
exp		e[+-]?{digit}*
lineno		{digit}+
int		({digit}+{exp}?|{digit}({digit}|\ )*({digit}+|{exp}))
real		{digit}*"."{digit}*{exp}?

/* id: a letter, then letters/digits (blanks may be embedded), optional $ or % suffix. */
letter		[a-z]
alnum		[a-z0-9]
id		({letter}{alnum}*[$%]?|{letter}({alnum}|\ )*({alnum}+|[$%]))

/* wscolon: optional blanks followed by a ':' or a newline (used as trailing context). */
wscolon		\ *[:\n]

%%

^.*$			{
				/*
				 * A whole line was matched: keep a private copy
				 * of its text in line_buf (dropping the previous
				 * copy; free(NULL) is a no-op), note that a new
				 * line has begun and REJECT so that the line is
				 * tokenized by the rules below.
				 */
				free(__DECONST(void *, line_buf));
				start_of_line = true;
				line_buf = strdup(yytext);
				REJECT;
			}

" "			{
				/* a blank yields no token, only a column step */
				ADD_LINECOL(0, 1);
			}

\n+			{
				/*
				 * A run of newlines is a single TOK_EOL; the
				 * line counter advances by the run length.
				 */
				tok_count[TOK_EOL]++;
				ADD_LINECOL(yyleng, 0);
				return TOK_EOL;
			}

^\ *{lineno}\ *[: ]*	{
				/*
				 * BASIC line number at the start of a line,
				 * including surrounding blanks and colons.
				 *
				 * The number is converted with strtoul(), which
				 * (unlike atoi()) has defined behavior for
				 * arbitrarily long digit strings: on overflow it
				 * returns ULONG_MAX, which fails the range check
				 * below.  The range check uses the unclamped
				 * value; basic_lineno is clamped to the largest
				 * unsigned value so it never wraps around.
				 */
				unsigned long n;

				start_of_line = false;
				n = strtoul(yytext, NULL, 10);
				if (n > (unsigned)-1)
					basic_lineno = (unsigned)-1;
				else
					basic_lineno = (unsigned)n;
				yylval.num = basic_lineno;
				col_number = yyleng;
				prev_col = 0;
				if ((uintmax_t)n >= BASIC_LINES) {
					cb2c_diag(DIAG_ERR,
						  "invalid line number");
				} else {
					tok_count[TOK_LINENO]++;
					return TOK_LINENO;
				}
			}

abs			{ RETURN_TOK(TOK_ABS); }
and			{ RETURN_TOK(TOK_AND); }
"^"			{ RETURN_TOK(TOK_ARROWUP); }
asc			{ RETURN_TOK(TOK_ASC); }
"*"			{ RETURN_TOK(TOK_ASTERISK); }
atn			{ RETURN_TOK(TOK_ATN); }
chr"$"			{ RETURN_TOK(TOK_CHR_D); }
close			{ RETURN_TOK(TOK_CLOSE); }
clr			{ RETURN_TOK(TOK_CLR); }
cmd			{ RETURN_TOK(TOK_CMD); }
:			{ RETURN_TOK(TOK_COLON); }
,			{ RETURN_TOK(TOK_COMMA); }
cont			{ RETURN_TOK(TOK_CONT); }
cos			{ RETURN_TOK(TOK_COS); }
")"			{ RETURN_TOK(TOK_CPAREN); }

data			{
				/*
				 * The value list is not tokenized by rules:
				 * DATA_read_values() reads it directly from
				 * the input and rewrites yytext and yyleng.
				 */
				DATA_read_values();
				RETURN_TOK(TOK_DATA);
			}

def			{ RETURN_TOK(TOK_DEF); }
dim			{ RETURN_TOK(TOK_DIM); }

end/{wscolon}		{
				/* whatever follows on this line is discarded */
				toeol(false);
				RETURN_TOK(TOK_END);
			}

=			{ RETURN_TOK(TOK_EQ); }
exp			{ RETURN_TOK(TOK_EXP); }
fn			{ RETURN_TOK(TOK_FN); }
for			{ RETURN_TOK(TOK_FOR); }
fre			{ RETURN_TOK(TOK_FRE); }
get			{ RETURN_TOK(TOK_GET); }
gosub			{ RETURN_TOK(TOK_GOSUB); }
goto			{ RETURN_TOK(TOK_GOTO); }
go			{ RETURN_TOK(TOK_GO); }
">"			{ RETURN_TOK(TOK_GT); }
#			{ RETURN_TOK(TOK_HASH); }
if			{ RETURN_TOK(TOK_IF); }
input#			{ RETURN_TOK(TOK_INPUT_H); }
input			{ RETURN_TOK(TOK_INPUT); }
int			{ RETURN_TOK(TOK_INT); }
left"$"			{ RETURN_TOK(TOK_LEFT_D); }
len			{ RETURN_TOK(TOK_LEN); }
let			{ RETURN_TOK(TOK_LET); }

list\ *(-+|[0-9]+|[0-9]+\ *-+|-+\ *[0-9]+|[0-9]+\ *-+\ *[0-9]+)/{wscolon} {
				/*
				 * LIST with a line range: the token text is
				 * the whole matched command; the rest of the
				 * line is discarded.
				 */
				yylval.str = strdup(yytext);
				toeol(false);
				RETURN_TOK(TOK_LIST);
			}

list/{wscolon}		{
				/* LIST without a range; rest of line discarded */
				yylval.str = strdup(yytext);
				toeol(false);
				RETURN_TOK(TOK_LIST);
			}

load			{ RETURN_TOK(TOK_LOAD); }
log			{ RETURN_TOK(TOK_LOG); }
"<"			{ RETURN_TOK(TOK_LT); }
mid"$"			{ RETURN_TOK(TOK_MID_D); }
-			{ RETURN_TOK(TOK_MINUS); }

new/{wscolon}		{
				/* whatever follows on this line is discarded */
				toeol(false);
				RETURN_TOK(TOK_NEW);
			}

next			{ RETURN_TOK(TOK_NEXT); }
not			{ RETURN_TOK(TOK_NOT); }
on			{ RETURN_TOK(TOK_ON); }
"("			{ RETURN_TOK(TOK_OPAREN); }
open			{ RETURN_TOK(TOK_OPEN); }
or			{ RETURN_TOK(TOK_OR); }
peek			{ RETURN_TOK(TOK_PEEK); }

~			{	
				/* the token text is a copy of CBM_PI */
				yylval.str = strdup(CBM_PI);
				RETURN_TOK(TOK_PI);
			}

"+"			{ RETURN_TOK(TOK_PLUS); }
poke			{ RETURN_TOK(TOK_POKE); }
pos			{ RETURN_TOK(TOK_POS); }
print#			{ RETURN_TOK(TOK_PRINT_H); }
print			{ RETURN_TOK(TOK_PRINT); }
read			{ RETURN_TOK(TOK_READ); }

rem.*			{
				/*
				 * The token text is everything after the
				 * three characters of "rem", up to the end of
				 * the line.
				 */
				yylval.str = strdup(&yytext[3]);
				RETURN_TOK(TOK_REM);
			}

restore			{ RETURN_TOK(TOK_RESTORE); }

return/{wscolon}	{
				/* whatever follows on this line is discarded */
				toeol(false);
				RETURN_TOK(TOK_RETURN);
			}

right"$"		{ RETURN_TOK(TOK_RIGHT_D); }
rnd			{ RETURN_TOK(TOK_RND); }
run			{ RETURN_TOK(TOK_RUN); }
save			{ RETURN_TOK(TOK_SAVE); }
;			{ RETURN_TOK(TOK_SEMICOLON); }
sgn			{ RETURN_TOK(TOK_SGN); }
sin			{ RETURN_TOK(TOK_SIN); }
"/"			{ RETURN_TOK(TOK_SLASH); }
spc			{ RETURN_TOK(TOK_SPC); }
sqr			{ RETURN_TOK(TOK_SQR); }
step			{ RETURN_TOK(TOK_STEP); }
stop			{ RETURN_TOK(TOK_STOP); }
str"$"			{ RETURN_TOK(TOK_STR_D); }
sys			{ RETURN_TOK(TOK_SYS); }
tab			{ RETURN_TOK(TOK_TAB); }
tan			{ RETURN_TOK(TOK_TAN); }
then			{ RETURN_TOK(TOK_THEN); }
to			{ RETURN_TOK(TOK_TO); }
usr			{ RETURN_TOK(TOK_USR); }
val			{ RETURN_TOK(TOK_VAL); }
verify			{ RETURN_TOK(TOK_VERIFY); }
wait			{ RETURN_TOK(TOK_WAIT); }

{id}			{
				/*
				 * Identifier-like text.  ID_detect_kw() tells
				 * whether (and where) it contains a keyword:
				 *
				 *  -2  no keyword, text ends in '$' or '%'
				 *  -1  no keyword
				 *   0  text begins with a keyword
				 *  >0  offset of the first keyword in the text
				 *
				 * Blanks are removed from the name and it is
				 * cut down to its first two characters (plus
				 * the type suffix, if any).
				 */
				int i;
				ptrdiff_t rc;
				size_t len;

				rc = ID_detect_kw(yytext);
				switch (rc) {
				case -2:
					/* no keyword and ends with '[$%]' */
					yylval.str = strdup(yytext);
					ID_remove_ws(yylval.str);
					len = strlen(yylval.str);
					if (len > 3) {
						/* keep two chars and the suffix */
						yylval.str[2] =
						    yylval.str[len - 1];
						yylval.str[3] = '\0';
					}
					RETURN_TOK(TOK_ID);
					break;
				case -1:
					/* no keyword */
					yylval.str = strdup(yytext);
					ID_remove_ws(yylval.str);
					len = strlen(yylval.str);
					if (len > 2)
						yylval.str[2] = '\0';
					RETURN_TOK(TOK_ID);
					break;
				case 0:
					/* begins with a keyword */
					REJECT;
					break;
				default:
					/*
					 * has a keyword, rc is its position:
					 * push the keyword and everything
					 * after it back, last character
					 * first, and pre-compensate for the
					 * full yyleng that RETURN_TOK adds.
					 */
					for (i = 0; i < yyleng - rc; i++)
						unput(yytext[yyleng - 1 - i]);
					ADD_LINECOL(0, -(yyleng - rc));
					yylval.str = strndup(yytext, rc);
					ID_remove_ws(yylval.str);
					/*
					 * The part before the keyword is a
					 * plain name (a type suffix can only
					 * end the whole match), so shorten it
					 * exactly as in the no-keyword case;
					 * otherwise "count" and the "count"
					 * of "countand1" would end up as
					 * different names.
					 */
					len = strlen(yylval.str);
					if (len > 2)
						yylval.str[2] = '\0';
					RETURN_TOK(TOK_ID);
					break;
				}
			}

{int}|"."		{
				/*
				 * Integer constant (or a lone '.').  At the
				 * start of a line the match is rejected so the
				 * line-number rule gets it instead.  Embedded
				 * blanks are removed from a scratch copy
				 * before conversion.
				 *
				 * NOTE(review): atoi() stops at the first
				 * non-digit, so an exponent allowed by {int}
				 * does not affect the value - confirm this is
				 * intended.
				 */
				char *digits;

				if (start_of_line)
					REJECT;
				digits = strdup(yytext);
				ID_remove_ws(digits);
				yylval.num = atoi(digits);
				free(digits);
				RETURN_TOK(TOK_INTEGER);
			}

{real}			{
				/* the token text is the literal as written */
				yylval.str = strdup(yytext);
				RETURN_TOK(TOK_REAL);
			}

\"[^"\n]*\"?		{
				/*
				 * String literal; the closing quote may be
				 * missing at the end of a line.
				 *
				 * STRING_convert() writes at most two bytes
				 * per source character (an escape plus the
				 * character) and a terminating NUL, so the
				 * buffer is sized from the token length
				 * rather than trusting a fixed 4096 bytes.
				 * It is never made smaller than that
				 * historical size.
				 */
				size_t bufsz = 2 * (size_t)yyleng + 1;

				if (bufsz < 4096)
					bufsz = 4096;
				yylval.str = calloc(1, bufsz);
				STRING_convert(yytext, yylval.str, yyleng);
				RETURN_TOK(TOK_STRING);
			}

.			{
				/*
				 * Any character no other rule accepts: step
				 * over it and report it.  No token is
				 * returned here.
				 */
				ADD_LINECOL(0, 1);
				cb2c_diag(DIAG_ERR, "invalid character '%c'",
				    yytext[0]);
			}

%%

static void
DATA_read_values(void)
{
	int c;
	char *val;
	bool in_quote = false, strip_ws = true;

	ADD_LINECOL(0, yyleng);
	yyleng = 0;
	for (;;) {
		c = input();
		if (c == ' ') {
			if (strip_ws)
				continue;
		} else {
			strip_ws = false;
		}
		if ((c == ':' && !in_quote) || (c == '\n')) {
			yytext[yyleng++] = '\0';
			unput(c);
			break;
		}
		if (c == ',' && !in_quote) {
			strip_ws = true;
			yytext[yyleng++] = '\001';
		} else {
			yytext[yyleng++] = c;
		}
		if (c == '"')
			in_quote = !in_quote;
	}
	while ((val = strsep(&yytext, "\001")) != NULL)
		data_values[data_index++] = strdup(val);
}

/*
 * Look for BASIC keywords inside id.
 *
 * Returns the offset of the keyword occurrence that starts earliest in id
 * (0 if id begins with a keyword).  If id contains no keyword, returns -2
 * when the current token text (yytext) ends in '$' or '%', and -1
 * otherwise.
 */
static ptrdiff_t
ID_detect_kw(char *id)
{
	int k;
	ptrdiff_t first = PTRDIFF_MAX;
	char last;

	for (k = 0; k < (int)NELEM(keywords); k++) {
		const char *hit;

		/* a keyword longer than the text cannot occur in it */
		if (strlen(id) < strlen(keywords[k]))
			continue;
		hit = strstr(id, keywords[k]);
		if (hit != NULL && (hit - id) < first)
			first = hit - id;
	}
	if (first != PTRDIFF_MAX)
		return first;

	last = yytext[yyleng - 1];
	return (last == '$' || last == '%') ? -2 : -1;
}

/*
 * Remove every blank (' ') from the NUL-terminated string id, in place.
 * Other characters keep their relative order.
 */
static void
ID_remove_ws(char *id)
{
	size_t rd, wr = 0;

	for (rd = 0; id[rd] != '\0'; rd++) {
		if (id[rd] == ' ')
			continue;
		id[wr++] = id[rd];
	}
	id[wr] = '\0';
}

/*
 * Convert the string literal src (len characters, starting with its
 * opening quote and optionally ending with a closing quote) into dest.
 *
 *  - The characters ?, \, ' and " are prefixed with a backslash.
 *  - "{$xx}" is replaced by the byte given by the hexadecimal digits.
 *  - A name from pc_cc[] (e.g. "{clr}") is replaced by its code byte.
 *  - Any other '{' is reported as an unknown control code.
 *
 * dest is always NUL-terminated.  At most two bytes are written per source
 * character, so dest must provide room for 2 * len + 1 bytes.
 */
void
STRING_convert(const char *src, char *dest, size_t len)
{
	bool found;
	int spos, dpos, i;
	size_t srclen, cclen;

	/* an empty literal, with or without its closing quote */
	srclen = strlen(src);
	if ((srclen == 2 && src[1] == '"') || srclen == 1) {
		dest[0] = '\0';
		return;
	}

	spos = 1;	/* skip the opening quote */
	dpos = 0;
	/* the last character is only converted if it is not a quote */
	while (spos < (int)len - 1 ||
	       (spos == (int)len - 1 && src[len - 1] != '"')) {
		if (src[spos] != '{') {
			/* not a petcat control code */
			if (src[spos] == '\?' || src[spos] == '\\' ||
			    src[spos] == '\'' || src[spos] == '\"') {
				/* escape some characters */
				dest[dpos++] = '\\';
			}
			dest[dpos++] = src[spos++];
			continue;
		}
		if (src[spos + 1] == '$') {
			/* a 'literal' petcat control code: {$xx} */
			dest[dpos++] = (char)strtol(&src[spos + 2],
			    NULL, 16);
			spos += 5;
			continue;
		}
		found = false;
		for (i = 0; i < (int)NELEM(pc_cc) && !found; i++) {
			cclen = strlen(pc_cc[i].cc_str);
			if (strncmp(&src[spos], pc_cc[i].cc_str,
				    cclen) == 0) {
				/* a petcat control code with a name */
				dest[dpos++] = pc_cc[i].cc_code;
				spos += cclen;
				found = true;
			}
		}
		if (!found) {
			cb2c_diag(DIAG_ERR,
			    "unknown control code in '%s'",
			    &src[spos]);
			/*
			 * Should the diagnostic return, copy the brace
			 * as an ordinary character.  Without advancing
			 * spos the loop would never terminate.
			 */
			dest[dpos++] = src[spos++];
		}
	}
	dest[dpos] = '\0';
}

/*
 * Release every string stored in data_values[].
 *
 * Each slot is reset to NULL after it has been freed, so the table holds
 * no dangling pointers and calling this function again is harmless.
 * data_index is left unchanged.
 */
void
free_data_values(void)
{
	int i;

	for (i = 0; i < DATA_VALS_SIZE; i++) {
		/* free(NULL) is a no-op, so unused slots need no test */
		free(__DECONST(void *, data_values[i]));
		data_values[i] = NULL;
	}
}

/*
 * Discard input until EOL (or optionally, until the next colon).
 *
 * The terminating newline or colon is pushed back so the regular rules
 * see it.  If the input ends first, the function simply returns: there is
 * nothing to push back.  input() reports the end of input as EOF or as 0,
 * depending on the flex version, and its result must be kept in an int to
 * tell these apart from ordinary characters.
 */
void
toeol(bool tocolon)
{
	int c;

	yytext[yyleng] = '\0';
	for (;;) {
		c = input();
		if (c == EOF || c == '\0')
			return;
		if (c == '\n' || (tocolon && c == ':'))
			break;
	}
	unput(c);
}
