/* This file is part of Mailfromd.             -*- c -*-
   Copyright (C) 2006-2020 Sergey Poznyakoff

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>. */

MF_BUILTIN_MODULE
MF_COND(WITH_DSPAM)

#include "srvcfg.h"
#undef HAVE_CONFIG_H
#define CONFIG_DEFAULT "/dev/null"
#define LOGDIR "/dev/null"
#include <libdspam.h>
#include "mflib/dspam.h"
#include "msg.h"

/* User parameters.  The user and group names are handed to dspam_create
   when a context is created, dspam_config names the configuration file
   read during the first-time initialization, and dspam_profile selects
   the profile-specific configuration entries. */
MF_VAR(dspam_user, STRING, SYM_PRECIOUS);
MF_VAR(dspam_group, STRING, SYM_PRECIOUS);
MF_VAR(dspam_config, STRING, SYM_PRECIOUS);
MF_VAR(dspam_profile, STRING, SYM_PRECIOUS);
/* Output variables.  The signature is stored only when a signature was
   requested in the flags; probability and confidence are stored after
   each processed message. */
MF_VAR(dspam_signature, STRING, SYM_PRECIOUS);
MF_VAR(dspam_probability, NUMBER);
MF_VAR(dspam_confidence, NUMBER);
/* Number of decimal digits kept in the probability and confidence: both
   values are multiplied by ten raised to this power before being stored.
   This one is read, not written, when a message is processed; a zero
   value is replaced by the default below during initialization. */
MF_VAR(dspam_prec, NUMBER);
#define DEFAULT_DSPAM_PREC 3

/* Nonzero once the dspam library and its storage driver have been
   initialized by get_config. */
static int _dspam_initialized;

/* Exit handler: shut down the dspam storage driver.  No driver context
   is supplied. */
static void
_dspam_shutdown(void)
{
	dspam_shutdown_driver(NULL);
}

/* A pair of integer values: a source value and the value it translates
   to.
   NOTE(review): nothing in the visible part of this file refers to this
   structure; the translation tables below use struct builtin_const_trans
   instead.  Confirm whether it is still needed. */
struct transtab
{
	int trans_from;
	int trans_to;
};

/* Translation table for the operating mode constants.  The dspam
   built-in uses it to convert the mode bits of its flags argument into
   the mode value given to dspam_create. */
static struct builtin_const_trans mode_trans[] = {
	MF_TRANS(DSM_PROCESS),
	MF_TRANS(DSM_CLASSIFY)
};

/* Translation table for the context flag constants.  The dspam built-in
   uses it to build the flag bitmap given to dspam_create. */
static struct builtin_const_trans flag_trans[] = {
	MF_TRANS(DSF_SIGNATURE),
	MF_TRANS(DSF_NOISE),
	MF_TRANS(DSF_WHITELIST)
};

/* Translation table for the tokenizer constants.  The dspam built-in
   uses it to set the tokenizer of the context when the caller selects
   one in the flags argument. */
static struct builtin_const_trans tokenizer_trans[] = {
	MF_TRANS(DSZ_WORD),
	MF_TRANS(DSZ_CHAIN),
	MF_TRANS(DSZ_SBPH),
	MF_TRANS(DSZ_OSB),
};

/* Translation table for the training mode constants.  The dspam
   built-in uses it to set the training mode of the context. */
static struct builtin_const_trans tmod_trans[] = {
	MF_TRANS(DST_TEFT),
	MF_TRANS(DST_TOE),
	MF_TRANS(DST_TUM)
};

/* Translation table for the classification constants.  The dspam
   built-in uses it in both directions: to set the classification of the
   context from its optional argument, and to convert the result of the
   processing into the value it returns. */
static struct builtin_const_trans class_trans[] = {
	MF_TRANS(DSR_ISSPAM),    
	MF_TRANS(DSR_ISINNOCENT),
	MF_TRANS(DSR_NONE)
};

/* Translation table for the classification source constants.  The dspam
   built-in uses it to set the source of the context from its optional
   argument. */
static struct builtin_const_trans source_trans[] = {
	MF_TRANS(DSS_ERROR),
	MF_TRANS(DSS_CORPUS),    
	MF_TRANS(DSS_INOCULATION),
	MF_TRANS(DSS_NONE)
};

static void
ctx_cleanup(void *ptr)
{	
	DSPAM_CTX *ctx = ptr;
	dspam_destroy(ctx);
}


/* One line of the dspam configuration file, split into words. */
struct config_entry {
	int argc;      /* number of words in argv */
	char **argv;   /* the words; argv[0] is the keyword */
	/* Shorthands for the first two words of an entry.
	   NOTE(review): config_value reads argv[1] even when argc is 1;
	   this presumably relies on the word array being terminated by
	   a NULL pointer - confirm. */
#       define config_keyword argv[0]
#       define config_value argv[1]
};

/* List item destructor for configuration entries.  DATA points to a
   struct config_entry allocated by read_config.  Both the word array and
   the entry itself are released: the entry is a separate heap object and
   nobody else frees it once it has been handed to the list. */
static void
free_config_entry(void *data)
{
	struct config_entry *entry = data;

	if (!entry)
		return;
	mu_argcv_free(entry->argc, entry->argv);
	free(entry);
}

/* List comparator for configuration entries.  A and B point to struct
   config_entry objects; they compare equal when their keywords are the
   same, ignoring case.  Returns the result of strcasecmp on the two
   keywords. */
static int
compare_config_entry(const void *a, const void *b)
{
	const char *kw_a = ((struct config_entry const *) a)->config_keyword;
	const char *kw_b = ((struct config_entry const *) b)->config_keyword;

	return strcasecmp(kw_a, kw_b);
}

/* Look up the entry with keyword KW in the list CONFIG.  Returns the
   entry found by mu_list_locate, or NULL if CONFIG is NULL or the
   lookup fails. */
struct config_entry *
config_find(mu_list_t config, const char *kw)
{
	struct config_entry probe;
	struct config_entry *found;

	if (!config)
		return NULL;

	/* Build a one-word search key whose only word is KW itself. */
	probe.argc = 1;
	probe.argv = (char **)&kw;
	if (mu_list_locate(config, &probe, (void **)&found) != 0)
		return NULL;
	return found;
}

/* Return the value (second word) of the entry with keyword KW in the
   list CONFIG, or NULL if config_find does not find such an entry. */
const char *
config_find_value(mu_list_t config, const char *kw)
{
	struct config_entry *found = config_find(config, kw);

	return found ? found->config_value : NULL;
}

/* Read the configuration file FILE_NAME and append one entry per
   non-empty line to the list CONFIG.

   The file is read through an INLINE-COMMENT filter created with the
   arguments "#" and "-r".  Each line is split into words with the
   default wordsplit flags; the resulting word array is handed over to
   the new entry and is therefore detached from the wordsplit structure
   before that structure is freed.

   Returns 0 on success.  Returns a non-zero error code if the file
   cannot be opened, the filter cannot be created, a line cannot be read
   or split, or an entry cannot be added to the list.  Each failure is
   reported through mu_error.  Entries appended before a failure remain
   in CONFIG. */
static int
read_config(mu_list_t config, const char *file_name)
{
	int rc;
	mu_stream_t str, flt;
	char *buf = NULL;
	size_t size = 0, n;
	static const char *args[] = { "INLINE-COMMENT", "#", "-r" };
	
	if ((rc = mu_file_stream_create(&str, file_name, MU_STREAM_READ))) {
		mu_error(_("cannot open configuration file `%s': %s"),
			 file_name, mu_strerror(rc));
		return rc;
	}

	rc = mu_filter_create_args(&flt, str,
				   "INLINE-COMMENT",
				   MU_ARRAY_SIZE(args), args,
				   MU_FILTER_DECODE,
				   MU_STREAM_READ);
	/* The filter keeps its own reference to the file stream. */
	mu_stream_unref(str);
	if (rc) {
		mu_error (_("cannot open filter stream for `%s': %s"),
			  file_name, mu_strerror (rc));
		return rc;
	}
	str = flt;

	for (;;) {
		struct mu_wordsplit ws;

		rc = mu_stream_getline(str, &buf, &size, &n);
		if (rc) {
			mu_error(_("error reading `%s': %s"),
				 file_name, mu_strerror(rc));
			break;
		}
		if (n == 0)
			break; /* end of file */

		if (mu_wordsplit(buf, &ws, MU_WRDSF_DEFFLAGS)) {
			mu_error("mu_wordsplit: %s",
				 mu_wordsplit_strerror(&ws));
			rc = MU_ERR_PARSE;
			break;
		}

		if (ws.ws_wordc) {
			struct config_entry *ent = mu_alloc(sizeof(*ent));

			ent->argc = ws.ws_wordc;
			ent->argv = ws.ws_wordv;
			rc = mu_list_append(config, ent);
			if (rc) {
				mu_error("mu_list_append: %s",
					 mu_strerror(rc));
				/* The words still belong to ws here. */
				free(ent);
				mu_wordsplit_free(&ws);
				break;
			}
			/* The entry owns the words now; keep
			   mu_wordsplit_free from releasing them. */
			ws.ws_wordc = 0;
			ws.ws_wordv = NULL;
		} /* FIXME: diagnostics */
		mu_wordsplit_free(&ws);
	}
	free(buf);
	mu_stream_close(str);
	mu_stream_destroy(&str);
	return rc;
}


/* Create an empty configuration list.  The list destroys its items with
   free_config_entry and compares them with compare_config_entry.
   Returns the new list, or NULL if it cannot be created; the failure is
   reported through mu_error. */
static void *
alloc_config()
{
	mu_list_t config = NULL;
	int rc = mu_list_create(&config);

	if (rc) {
		/* Do not hand an uninitialized list pointer to the caller. */
		mu_error("mu_list_create: %s", mu_strerror(rc));
		return NULL;
	}
	mu_list_set_destroy_item(config, free_config_entry);
	mu_list_set_comparator(config, compare_config_entry);
	return config;
}

static void
destroy_config(void *data)
{
	mu_list_t config = data;
	mu_list_destroy(&config);
}

/* Module data declaration.  Presumably this registers alloc_config and
   destroy_config as the constructor and destructor of the per-module
   data that the dspam built-in later passes to get_config - confirm
   against the macro definition. */
MF_DECLARE_DATA(DSPAM_CONFIG, alloc_config, destroy_config)

MF_DSEXP_SUPPRESS([<get_config>],[<
/* Perform the one-time initialization of the dspam library and return
   CONFIG.

   While the initialization flag is zero, this function:
   - reads the file named by the dspam_config variable into CONFIG,
     provided that the name is not empty;
   - initializes libdspam with the value of the StorageDriver entry and
     signals a failure if that does not succeed;
   - initializes the storage driver and registers its shutdown routine
     to run at exit;
   - supplies defaults for the dspam_user and dspam_prec variables.

   CONFIG is returned unchanged in every case.

   NOTE(review): the flag is a file-scope static, so only the list
   passed in the call that completes the initialization gets populated
   from the file.  The flag is also set after the libdspam assertion;
   if a failed assertion leaves this function early, a later call
   presumably reads the file again and appends to the same list.
   Confirm that both are intended. */
static mu_list_t
get_config(eval_environ_t env, mu_list_t config)
{
	/* Initialize dspam library and set up global variables, if
	   needed */
	if (!_dspam_initialized) {
		const char *config_file = MF_VAR_STRING(dspam_config);
		/* The result of reading the file is not checked here. */
		if (config_file && config_file[0])
			read_config(config, config_file);
	
		MF_ASSERT(libdspam_init(config_find_value(config,
							  "StorageDriver"))
			  == 0,
			  mfe_failure,
			  "libdspam_init failed");
			
		dspam_init_driver(NULL);
		atexit(_dspam_shutdown);
		_dspam_initialized = 1;

		/* Default user name: the one in mf_server_user. */
		if (MF_VAR_STRING(dspam_user) == NULL)
			MF_VAR_SET_STRING(dspam_user, mf_server_user);

		/* A zero precision is replaced by the default one. */
		if (MF_VAR_REF(dspam_prec, uint) == 0)
			MF_VAR_REF(dspam_prec, uint, DEFAULT_DSPAM_PREC);
	}

	return config;
}
>])

/* Describes how a configuration keyword is applied to a DSPAM context. */
struct keyword_prop {
	char *name;   /* keyword, compared without regard to case */
	int len;      /* if nonzero, only this many leading characters of
			 the keyword are compared; if zero, the whole
			 keyword must match */
	int flag;     /* one of the PROP_ constants below */
};

/* Actions taken for a matching keyword. */
#define PROP_ATTACH    1  /* attach keyword and value to the context */
#define PROP_ALGORITHM 2  /* value selects an algorithm */
#define PROP_TOKENIZER 3  /* value selects a tokenizer */
#define PROP_PVALUE    4  /* value selects a p-value computation */

/* Keywords recognized in the configuration file.  The table ends with
   an entry whose name is NULL. */
static struct keyword_prop keyword_prop[] = {
	{ "IgnoreHeader", 0, PROP_ATTACH },
	{ "MySQL", 5, PROP_ATTACH },
	{ "PgSQL", 5, PROP_ATTACH },
	{ "SQLite", 6, PROP_ATTACH },
	{ "LocalMX", 0, PROP_ATTACH },
	{ "Storage", 7, PROP_ATTACH },
	{ "Processor", 9, PROP_ATTACH },
	{ "Hash", 4, PROP_ATTACH },
	{ "Algorithm", 0, PROP_ALGORITHM },
	{ "PValue", 0, PROP_PVALUE },
	{ "Tokenizer", 0, PROP_TOKENIZER },
	{ NULL }
};

/* Values accepted by the Algorithm keyword and the corresponding
   algorithm bits.  Names are looked up without regard to case. */
static struct mu_kwd algorithm_kwd[] = {
	{ "graham", DSA_GRAHAM },
	{ "burton", DSA_BURTON },
	{ "robinson", DSA_ROBINSON },
	{ "naive", DSA_NAIVE },
	{ "chi-square", DSA_CHI_SQUARE },
	{ NULL }
};

/* Values accepted by the PValue keyword and the corresponding p-value
   bits.  Names are looked up without regard to case.  When no known
   value is configured, the Graham p-value bit is used together with the
   configured algorithms. */
static struct mu_kwd pvalue_kwd[] = {
	{ "robinson", DSP_ROBINSON },
	{ "markov", DSP_MARKOV },
	{ NULL }
};

/* Values accepted by the Tokenizer keyword and the corresponding
   tokenizer codes.  Names are looked up without regard to case; "chain"
   and "chained" select the same tokenizer. */
static struct mu_kwd tokenizer_kwd[] = {
	{ "word", DSZ_WORD },
	{ "chain", DSZ_CHAIN },
	{ "chained", DSZ_CHAIN },
	{ "sbph", DSZ_SBPH },
	{ "osb", DSZ_OSB },
	{ NULL }
};

static void
set_context_attributes(DSPAM_CTX *ctx, mu_list_t config, const char *profile,
		       int ignore_tokenizer)
{
	mu_iterator_t itr;
	int algo = 0;
	int algo_set = 0;
	int pvalue = 0;
	int pvalue_set = 0;
	int tokenizer = 0;
	int tokenizer_set = 0;
	int n;
	
	if (!profile || !profile[0])
		profile = config_find_value(config, "DefaultProfile");

	mu_list_get_iterator(config, &itr);
	for (mu_iterator_first(itr); !mu_iterator_is_done(itr);
	     mu_iterator_next(itr)) {
		struct config_entry *ent;
		struct keyword_prop *prop;
		
		mu_iterator_current (itr, (void **)&ent);
		for (prop = keyword_prop; prop->name; prop++) {
			char *p;
			
			if ((prop->len ?
			     strncasecmp(ent->config_keyword, prop->name,
					 prop->len) :
			     strcasecmp(ent->config_keyword, prop->name))
			    == 0) {
				switch (prop->flag) {
				case PROP_ATTACH:
					dspam_addattribute(ctx,
							   ent->config_keyword,
							   ent->config_value);
					break;

				case PROP_ALGORITHM:
					algo_set = 1;
					if (mu_kwd_xlat_name_ci(algorithm_kwd,
							 ent->config_value,
								&n) == 0)
						algo |= n;
					break;

				case PROP_PVALUE:
					if (pvalue_set)
						continue;
					if (mu_kwd_xlat_name_ci(pvalue_kwd,
							 ent->config_value,
								&n) == 0) {
						pvalue = n;
						pvalue_set = 1;
					}
					break;
					
				case PROP_TOKENIZER:
					tokenizer_set = 1;
					if (mu_kwd_xlat_name_ci(tokenizer_kwd,
							 ent->config_value,
								&n) == 0)
						tokenizer |= n;
					break;
				}
			} else if (profile &&
				   (p = strchr(ent->config_keyword, '.')) &&
				   strcasecmp(p + 1, profile) == 0) {
				size_t len = p - ent->config_keyword;
				char *key = mu_alloc(len + 1);
				memcpy(key, ent->config_keyword, len);
				key[len] = 0;
				dspam_addattribute(ctx, key,
						   ent->config_value);
				free(key);
			}
		}
	}
	mu_iterator_destroy(&itr);

	if (algo_set)
		ctx->algorithms = algo | (pvalue_set ? pvalue : DSP_GRAHAM);
	
	if (!ignore_tokenizer && tokenizer_set)
		ctx->tokenizer = tokenizer;

	if ((ctx->algorithms & DSA_CHI_SQUARE) &&
	    !(ctx->algorithms & DSP_ROBINSON))
		mu_diag_output(MU_DIAG_WARNING,
			       "Chi-Square algorithm enabled with other "
			       "algorithms: false positives may ensue");
}

/* number dspam(number msg, number flags; number class_source)

   Run a message through dspam and return its classification.

   The first argument is the descriptor of the message to process.  The
   second one combines the operating mode, the context flags, an
   optional tokenizer and the training mode.  The optional third
   argument combines the classification and the source to set in the
   context before processing.

   On return the probability and confidence variables hold the values
   computed by dspam, multiplied by ten raised to the power given by the
   precision variable.  If a signature was requested in the flags, the
   signature variable holds the identifier of the stored signature.

   The return value is the result of the processing, translated by means
   of the classification table.  Every failure is signalled as a
   failure exception. */
MF_DSEXP
MF_DEFUN(dspam, NUMBER, NUMBER nmsg, NUMBER mode_flags, OPTIONAL, NUMBER class_src)
{
	int rc;
	DSPAM_CTX *ctx;               	/* DSPAM Context */
	int mode;
	int flags;
	mu_message_t msg;
	mu_stream_t msgstr, instr;
	const char *msgbuf;
	size_t msgsize;
	unsigned prec;
	mu_transport_t trans[2];
	mu_list_t config = get_config(env, MF_GET_DATA);
	int tokenizer;
	
	/* Prepare message buffer */
	msg = bi_message_from_descr(env, nmsg);
	rc = mu_message_size(msg, &msgsize);
	MF_ASSERT(rc == 0,
		  mfe_failure,
		  "mu_message_size: %s", mu_strerror(rc));
	
	rc = mu_memory_stream_create(&msgstr, MU_STREAM_RDWR);
	MF_ASSERT(rc == 0,
		  mfe_failure,
		  "mu_memory_stream_create: %s",
		  mu_strerror(rc));
	MF_DCL_CLEANUP(msgstr, _builtin_stream_cleanup);

	rc = mu_message_get_streamref(msg, &instr);
	MF_ASSERT(rc == 0,
		  mfe_failure,
		  "mu_message_get_streamref: %s",
		  mu_strerror(rc));
	MF_DCL_CLEANUP(instr, _builtin_stream_cleanup);

	rc = mu_stream_copy(msgstr, instr, msgsize, NULL);
	MF_ASSERT(rc == 0,
		  mfe_failure,
		  "mu_stream_copy: %s",
		  mu_strerror(rc));
	
	MF_CLEANUP(instr);

	/* The buffer is handed to dspam_process as a C string, so it
	   must end with a null character.  Append one before taking the
	   address of the buffer, because the write may move it. */
	rc = mu_stream_write(msgstr, "", 1, NULL);
	MF_ASSERT(rc == 0,
		  mfe_failure,
		  "mu_stream_write: %s",
		  mu_strerror(rc));

	rc = mu_stream_ioctl(msgstr, MU_IOCTL_TRANSPORT, MU_IOCTL_OP_GET,
			     trans);
	MF_ASSERT(rc == 0,
		  mfe_failure,
		  "mu_stream_ioctl: %s",
		  mu_strerror(rc));
	msgbuf = (const char*)trans[0];
	
	/* Prepare DSPAM context */
	MF_ASSERT(_builtin_const_to_c(mode_trans, MU_ARRAY_SIZE(mode_trans),
				      mode_flags & _MFL__DSM_MASK, &mode) == 0,
		  mfe_failure,
		  "bad dspam mode");
	flags = _builtin_const_to_bitmap(flag_trans, MU_ARRAY_SIZE(flag_trans),
					 mode_flags);
	
	/* Create the DSPAM context */
	ctx = dspam_create(MF_VAR_STRING(dspam_user),
			   MF_VAR_STRING(dspam_group),
			   config_find_value(config, "Home"), mode,
			   flags);
	MF_ASSERT(ctx != NULL,
		  mfe_failure,
		  "dspam_create failed");
	MF_DCL_CLEANUP(ctx, ctx_cleanup);

	/* Default to the graham and burton algorithms with graham's
	   p-values; the configuration may override this below */
	ctx->algorithms = DSA_GRAHAM | DSA_BURTON | DSP_GRAHAM;

	/* A tokenizer selected in the flags takes precedence over the
	   one given in the configuration */
	tokenizer = mode_flags & _MFL__DSZ_MASK;
	set_context_attributes(ctx, config, MF_VAR_STRING(dspam_profile),
			       tokenizer);
	
	MF_ASSERT(dspam_attach(ctx, NULL) == 0,
		  mfe_failure,
		  "dspam_attach failed");
	
	/* Configure tokenizer */
	if (tokenizer)
		MF_ASSERT(_builtin_const_to_c(tokenizer_trans,
					      MU_ARRAY_SIZE(tokenizer_trans),
					      tokenizer, &ctx->tokenizer) == 0,
			  mfe_failure,
			  "bad dspam tokenizer");
	
	/* Set training mode */
	MF_ASSERT(_builtin_const_to_c(tmod_trans, MU_ARRAY_SIZE(tmod_trans),
			    mode_flags & _MFL__DST_MASK, &ctx->training_mode)
		  == 0,
		  mfe_failure,
		  "bad dspam training mode");
	
	/* Set up classification and source */
	if (MF_DEFINED(class_src)) {
		MF_ASSERT(_builtin_const_to_c(class_trans,
					      MU_ARRAY_SIZE(class_trans),
					      class_src & _MFL__DSR_MASK,
					      &ctx->classification) == 0,
			  mfe_failure,
			  "bad dspam classification flag");
		MF_ASSERT(_builtin_const_to_c(source_trans,
					      MU_ARRAY_SIZE(source_trans),
					      class_src & _MFL__DSS_MASK,
					      &ctx->source) == 0,
			  mfe_failure,
			  "bad dspam source flag");
	}
	
	/* Process the message */
	MF_ASSERT(dspam_process(ctx, msgbuf) == 0,
		  mfe_failure,
		  "dspam_process failed");

	/* Scale factor: ten raised to the configured precision */
	rc = MF_VAR_REF(dspam_prec, uint);
	prec = 1;
	while (rc--)
		prec *= 10;
	MF_VAR_REF(dspam_probability, ulong, ctx->probability * prec);
	MF_VAR_REF(dspam_confidence, ulong, ctx->confidence * prec);
	if (flags & DSF_SIGNATURE) {
		char signame[128];
		_ds_create_signature_id(ctx, signame, sizeof(signame));
		_ds_set_signature(ctx, ctx->signature, signame);
		MF_VAR_SET_STRING(dspam_signature, signame);
	}
	MF_ASSERT(_builtin_c_to_const(class_trans,
				      MU_ARRAY_SIZE(class_trans),
				      ctx->result,
				      &rc) == 0,
		  mfe_failure,
		  "unrecognized dspam result");
	MF_CLEANUP(ctx);
	
	/* FIXME: Any additional processing? */

	MF_RETURN(rc);
}
END