/*
 * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * The header names of these includes were lost; the ones below are
 * inferred from the identifiers this file uses.
 */
#include <unit/unit.h>
#include <unit/io.h>

#include <nvgpu/gk20a.h>
#include <nvgpu/fifo.h>
#include <nvgpu/runlist.h>
#include <nvgpu/tsg.h>
#include <nvgpu/channel.h>

#include "hal/fifo/runlist_ram_gk20a.h"
#include "hal/fifo/tsg_gk20a.h"

static void setup_fifo(struct gk20a *g, unsigned long *tsg_map,
		unsigned long *ch_map, struct nvgpu_tsg *tsgs,
		struct nvgpu_channel *chs, unsigned int num_tsgs,
		unsigned int num_channels,
		struct nvgpu_runlist_info **runlists, u32 *rl_data,
		bool interleave)
{
	struct nvgpu_fifo *f = &g->fifo;
	struct nvgpu_runlist_info *runlist = runlists[0];

	/* we only use runlist 0 here */
	runlist->mem[0].cpu_va = rl_data;
	runlist->active_tsgs = tsg_map;
	runlist->active_channels = ch_map;

	g->fifo.g = g;
	/* to debug, change this to (u64)-1 */
	g->log_mask = 0;
	/*
	 * Set the PTIMER source frequency to its nominal value to avoid
	 * rounding errors when scaling timeslices.
	 */
	g->ptimer_src_freq = 31250000;

	f->tsg = tsgs;
	f->channel = chs;
	f->num_channels = num_channels;
	f->runlist_info = runlists;
	/*
	 * For testing the runlist entry order format, these simpler dual-u32
	 * entries are enough. The logic is the same across chips.
	 */
	f->runlist_entry_size = 2 * sizeof(u32);
	g->ops.runlist.get_tsg_entry = gk20a_runlist_get_tsg_entry;
	g->ops.runlist.get_ch_entry = gk20a_runlist_get_ch_entry;
	g->ops.tsg.default_timeslice_us = nvgpu_tsg_default_timeslice_us;
	g->runlist_interleave = interleave;

	/* set bits in active_tsgs corresponding to indices in f->tsg[...] */
	nvgpu_bitmap_set(runlist->active_tsgs, 0, num_tsgs);
	/* likewise for channels; only read if a high enough tsg appears */
	nvgpu_bitmap_set(runlist->active_channels, 0, num_channels);
}

static void setup_tsg(struct nvgpu_tsg *tsgs, struct nvgpu_channel *chs,
		u32 i, u32 level)
{
	struct nvgpu_tsg *tsg = &tsgs[i];
	struct nvgpu_channel *ch = &chs[i];

	tsg->tsgid = i;
	nvgpu_rwsem_init(&tsg->ch_list_lock);
	nvgpu_init_list_node(&tsg->ch_list);
	tsg->num_active_channels = 1;
	tsg->interleave_level = level;

	/* 1:1 mapping for simplicity */
	ch->chid = i;
	nvgpu_list_add_tail(&ch->ch_entry, &tsg->ch_list);
}

static void setup_tsg_multich(struct nvgpu_tsg *tsgs,
		struct nvgpu_channel *chs, u32 i, u32 level,
		u32 ch_capacity, u32 ch_active)
{
	struct nvgpu_tsg *tsg = &tsgs[i];
	struct nvgpu_channel *ch = &chs[i + 1];
	u32 c;

	setup_tsg(tsgs, chs, i, level);
	tsg->num_active_channels = ch_active;

	/* bind the rest of the channels, onwards from the same id */
	for (c = 1; c < ch_capacity; c++) {
		ch->chid = i + c;
		nvgpu_list_add_tail(&ch->ch_entry, &tsg->ch_list);
		ch++;
	}
}

static int run_format_test(struct unit_module *m, struct nvgpu_fifo *f,
		struct nvgpu_tsg *tsg, struct nvgpu_channel *chs,
		u32 prio, u32 n_ch, u32 *rl_data,
		u32 *expect_header, u32 *expect_channel)
{
	u32 n;

	setup_tsg_multich(tsg, chs, 0, prio, 5, n_ch);

	/* entry capacity: a tsg header plus some channels */
	n = nvgpu_runlist_construct_locked(f, f->runlist_info[0], 0,
			1 + n_ch);

	if (n != 1 + n_ch) {
		unit_return_fail(m, "number of entries mismatch %d\n", n);
	}

	if (memcmp(rl_data, expect_header, 2 * sizeof(u32)) != 0) {
		unit_err(m, "rl_data[0]=%08x", rl_data[0]);
		unit_err(m, "rl_data[1]=%08x", rl_data[1]);
		unit_err(m, "expect_header[0]=%08x", expect_header[0]);
		unit_err(m, "expect_header[1]=%08x", expect_header[1]);
		unit_return_fail(m, "tsg header mismatch\n");
	}

	if (memcmp(rl_data + 2, expect_channel,
			2 * n_ch * sizeof(u32)) != 0) {
		unit_return_fail(m, "channel data mismatch\n");
	}

	return UNIT_SUCCESS;
}

static struct tsg_fmt_test_args {
	u32 channels;
	u32 chs_bitmap;
	u32 level;
	u32 timeslice;
	u32 expect_header[2];
	u32 expect_channel[10];
} tsg_fmt_tests[] = {
	/* priority 0, one channel */
	{ 1, 0x01, 0, 0, { 0x0600e000, 0 }, { 0, 0 } },
	/* priority 1, two channels */
	{ 2, 0x03, 1, 0, { 0x0a00e000, 0 }, { 0, 0, 1, 0 } },
	/* priority 2, five channels */
	{ 5, 0x1f, 2, 0, { 0x1600e000, 0 },
		{ 0, 0, 1, 0, 2, 0, 3, 0, 4, 0 } },
	/* priority 0, one channel, nondefault timeslice timeout */
	{ 1, 0x01, 0, 0xaa << 3, { 0x06a8e000, 0 }, { 0, 0 } },
	/* priority 0, three channels with two inactive ones in the middle */
	{ 3, 0x01 | 0x04 | 0x10, 0, 0, { 0x0e00e000, 0 },
		{ 0, 0, 2, 0, 4, 0 } },
};

/*
 * Check that inserting a single tsg of any level with a number of channels
 * works as expected.
 */
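/*
 * For reference, the expect_header[0] words in the table above decode as
 * follows (field layout inferred from the expected values and the dual-u32
 * gk20a entry format; not taken from the hw headers):
 *
 *   bits  0..11: tsgid              bit  13:     TSG entry type
 *   bits 14..15: timeslice scale    bits 18..25: timeslice timeout
 *   bits 26..31: tsg length (number of channels)
 *
 * E.g., 0x0600e000 is length 1, timeout 128, scale 3, TSG type, tsgid 0;
 * 0x06a8e000 only changes the timeout to 0xaa. The second header word and
 * the second word of each channel entry stay 0 in these tests.
 */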
static int test_tsg_format_gen(struct unit_module *m, struct gk20a *g,
		void *args)
{
	struct nvgpu_fifo *f = &g->fifo;
	struct nvgpu_runlist_info runlist;
	struct nvgpu_runlist_info *runlists = &runlist;
	unsigned long active_tsgs_map = 0;
	unsigned long active_chs_map = 0;
	struct nvgpu_tsg tsgs[1] = {{0}};
	struct nvgpu_channel chs[5] = {{0}};
	/* a header plus at most five channels */
	const u32 entries_in_list_max = 1 + 5;
	u32 rl_data[2 * entries_in_list_max];
	u32 ret;
	struct tsg_fmt_test_args *test_args = args;

	setup_fifo(g, &active_tsgs_map, &active_chs_map, tsgs, chs, 1, 5,
			&runlists, rl_data, false);
	active_chs_map = test_args->chs_bitmap;

	if (test_args->timeslice == 0U) {
		tsgs[0].timeslice_us = g->ops.tsg.default_timeslice_us(g);
	} else {
		tsgs[0].timeslice_us = test_args->timeslice;
	}

	ret = run_format_test(m, f, &tsgs[0], chs, test_args->level,
			test_args->channels, rl_data,
			test_args->expect_header, test_args->expect_channel);
	if (ret != 0) {
		unit_return_fail(m, "bad format\n");
	}

	return UNIT_SUCCESS;
}

/* compare 1:1 tsg-channel entries against expectations */
static int check_same_simple_tsgs(struct unit_module *m, u32 *expected,
		u32 *actual, u32 n_entries)
{
	u32 i;

	for (i = 0; i < n_entries; i++) {
		u32 want = expected[i];
		/*
		 * Two u32s per entry and two entries per tsg; the second
		 * entry of each tsg holds the channel id in its first u32.
		 */
		u32 entry_off = 2 * i + 1;
		u32 got = actual[2 * entry_off];

		if (want != got) {
			unit_return_fail(m,
				"wrong entry at %u: expected %u, got %u\n",
				i, want, got);
		}
	}

	return UNIT_SUCCESS;
}

/* Common logic for all tests below to reduce boilerplate */
static int test_common_gen(struct unit_module *m, struct gk20a *g,
		bool interleave, u32 sizelimit,
		u32 *levels, u32 levels_count,
		u32 *expected, u32 expect_count)
{
	struct nvgpu_fifo *f = &g->fifo;
	struct nvgpu_runlist_info runlist;
	struct nvgpu_runlist_info *runlists = &runlist;
	unsigned long active_tsgs_map = 0;
	unsigned long active_chs_map = 0;
	struct nvgpu_tsg tsgs[6] = {{0}};
	struct nvgpu_channel chs[6] = {{0}};
	u32 tsgs_in_list = expect_count;
	/* a tsg header and a channel entry for each */
	const u32 entries_in_list = 2 * tsgs_in_list;
	/* one entry is two u32s in these tests */
	u32 rl_data[2 * entries_in_list];
	u32 n;
	u32 i = 0;

	setup_fifo(g, &active_tsgs_map, &active_chs_map, tsgs, chs,
			levels_count, 6, &runlists, rl_data, interleave);

	for (i = 0; i < levels_count; i++) {
		setup_tsg(tsgs, chs, i, levels[i]);
	}

	n = nvgpu_runlist_construct_locked(f, &runlist, 0,
			sizelimit != 0U ? sizelimit : entries_in_list);

	if (sizelimit != 0 && sizelimit != entries_in_list) {
		/* a too-small size limit is always a negative test here */
		if (n != 0xffffffffU) {
			unit_return_fail(m,
				"limit %d, expected failure, got %u\n",
				sizelimit, n);
		}
		/*
		 * Compare what we got; it should be good up until the limit.
		 * For an odd limit we miss the last u32, but that is better
		 * than nothing.
		 */
		return check_same_simple_tsgs(m, expected, rl_data,
				sizelimit / 2);
	}

	if (n != entries_in_list) {
		unit_return_fail(m, "expected %u entries, got %u\n",
				entries_in_list, n);
	}

	return check_same_simple_tsgs(m, expected, rl_data, tsgs_in_list);
}

static int test_flat_gen(struct unit_module *m, struct gk20a *g,
		u32 sizelimit)
{
	u32 levels[] = {
		/* some random-ish order of priority levels */
		0, 1, 2, 1, 0, 2,
	};
	u32 expected[] = {
		/* high (2) indices first, then medium (1), then low (0) */
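		/*
		 * A flat runlist is simply sorted by descending interleave
		 * level, with ties keeping ascending tsgid order: per the
		 * levels array above, ids 2 and 5 are level 2, ids 1 and 3
		 * level 1, and ids 0 and 4 level 0.
		 */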
		2, 5, 1, 3, 0, 4,
	};

	return test_common_gen(m, g, false, sizelimit, levels,
			ARRAY_SIZE(levels), expected, ARRAY_SIZE(expected));
}

/*
 * Test the normal case: a successful construct is correct.
 */
static int test_flat(struct unit_module *m, struct gk20a *g, void *args)
{
	return test_flat_gen(m, g, 0);
}

/*
 * Just a corner case: space for only one tsg header, so even the first
 * channel entry doesn't fit.
 */
static int test_flat_oversize_tiny(struct unit_module *m, struct gk20a *g,
		void *args)
{
	return test_flat_gen(m, g, 1);
}

/*
 * One tsg header with its channel fits.
 */
static int test_flat_oversize_single(struct unit_module *m, struct gk20a *g,
		void *args)
{
	return test_flat_gen(m, g, 2);
}

/*
 * The second channel would get chopped off.
 */
static int test_flat_oversize_onehalf(struct unit_module *m, struct gk20a *g,
		void *args)
{
	return test_flat_gen(m, g, 3);
}

/*
 * Two full entries fit exactly.
 */
static int test_flat_oversize_two(struct unit_module *m, struct gk20a *g,
		void *args)
{
	return test_flat_gen(m, g, 4);
}

/*
 * All but the last channel entry fit.
 */
static int test_flat_oversize_end(struct unit_module *m, struct gk20a *g,
		void *args)
{
	return test_flat_gen(m, g, 11);
}

/* Common logic for the interleave tests below */
static int test_interleaving_gen(struct unit_module *m, struct gk20a *g,
		u32 sizelimit, u32 *levels, u32 levels_count,
		u32 *expected, u32 expect_count)
{
	return test_common_gen(m, g, true, sizelimit, levels, levels_count,
			expected, expect_count);
}

/* Items in all levels, interleaved */
static int test_interleaving_gen_all(struct unit_module *m, struct gk20a *g,
		u32 sizelimit)
{
	/*
	 * Named channel ids for us humans to parse.
	 *
	 * This works such that the first two TSGs, IDs 0 and 1 (with just
	 * one channel each), are at interleave level "low" ("l0"), the next
	 * IDs 2 and 3 are at level "med" ("l1"), and the last IDs 4 and 5
	 * are at level "hi" ("l2"). Runlist construction doesn't care, so we
	 * use an easy-to-understand order. The interleave pattern nests:
	 * each l0 item is preceded by a full round of l1 items, and each l1
	 * or l0 item by a full round of l2 items.
	 *
	 * When debugging this test and/or the runlist code, the logs of any
	 * interleave test should follow the order in the "expected" array.
	 * We start at the highest level, so the first IDs added should be
	 * h1 and h2, i.e., 4 and 5, etc.
	 */
	u32 l1 = 0, l2 = 1;
	u32 m1 = 2, m2 = 3;
	u32 h1 = 4, h2 = 5;
	u32 levels[] = { 0, 0, 1, 1, 2, 2 };
	u32 expected[] = {
		/* order of channel ids; partly used also for oversize tests */
		h1, h2, m1,
		h1, h2, m2,
		h1, h2, l1,
		h1, h2, m1,
		h1, h2, m2,
		h1, h2, l2,
	};

	return test_interleaving_gen(m, g, sizelimit, levels,
			ARRAY_SIZE(levels), expected, ARRAY_SIZE(expected));
}

static struct interleave_test_args {
	u32 sizelimit;
} interleave_tests[] = {
	/* all priority items */
	{ 0 },
	/* fail at level 2 immediately: space for just a tsg header, no ch entries */
	{ 1 },
	/* insert both l2 entries, then fail at the l1 level */
	{ 2 * 2 },
	/* insert both l2 entries, one l1, and just one l2: fail at the last l2 */
	{ (2 + 1 + 1) * 2 },
	/* stop at exactly the first l2 entry in the first l1-l0 transition */
	{ (2 + 1 + 2 + 1) * 2 },
	/* stop at exactly the first l0 entry that doesn't fit */
	{ (2 + 1 + 2 + 1 + 2) * 2 },
};

static int test_interleaving_gen_all_run(struct unit_module *m,
		struct gk20a *g, void *args)
{
	struct interleave_test_args *test_args = args;

	return test_interleaving_gen_all(m, g, test_args->sizelimit);
}

/*
 * Only l0 items.
 */
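/*
 * With only one level present there is nothing to interleave, so the
 * expected output is the same id order as in the flat case; the same
 * holds for the single-level l1 and l2 tests below.
 */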
static int test_interleaving_l0(struct unit_module *m, struct gk20a *g,
		void *args)
{
	u32 levels[] = { 0, 0 };
	/* The channel id sequence is trivial here and in most of the below */
	u32 expected[] = { 0, 1 };

	return test_interleaving_gen(m, g, 2 * ARRAY_SIZE(expected), levels,
			ARRAY_SIZE(levels), expected, ARRAY_SIZE(expected));
}

/*
 * Only l1 items.
 */
static int test_interleaving_l1(struct unit_module *m, struct gk20a *g,
		void *args)
{
	u32 levels[] = { 1, 1 };
	u32 expected[] = { 0, 1 };

	return test_interleaving_gen(m, g, 2 * ARRAY_SIZE(expected), levels,
			ARRAY_SIZE(levels), expected, ARRAY_SIZE(expected));
}

/*
 * Only l2 items.
 */
static int test_interleaving_l2(struct unit_module *m, struct gk20a *g,
		void *args)
{
	u32 levels[] = { 2, 2 };
	u32 expected[] = { 0, 1 };

	return test_interleaving_gen(m, g, 2 * ARRAY_SIZE(expected), levels,
			ARRAY_SIZE(levels), expected, ARRAY_SIZE(expected));
}

/*
 * Only low and medium priority items.
 */
static int test_interleaving_l0_l1(struct unit_module *m, struct gk20a *g,
		void *args)
{
	u32 l1 = 0, l2 = 1, m1 = 2, m2 = 3;
	u32 levels[] = { 0, 0, 1, 1 };
	u32 expected[] = { m1, m2, l1, m1, m2, l2 };

	return test_interleaving_gen(m, g, 2 * ARRAY_SIZE(expected), levels,
			ARRAY_SIZE(levels), expected, ARRAY_SIZE(expected));
}

/*
 * Only medium and high priority items.
 */
static int test_interleaving_l1_l2(struct unit_module *m, struct gk20a *g,
		void *args)
{
	u32 m1 = 0, m2 = 1, h1 = 2, h2 = 3;
	u32 levels[] = { 1, 1, 2, 2 };
	u32 expected[] = { h1, h2, m1, h1, h2, m2 };

	return test_interleaving_gen(m, g, 2 * ARRAY_SIZE(expected), levels,
			ARRAY_SIZE(levels), expected, ARRAY_SIZE(expected));
}

/*
 * Only low and high priority items.
 */
static int test_interleaving_l0_l2(struct unit_module *m, struct gk20a *g,
		void *args)
{
	u32 l1 = 0, l2 = 1, h1 = 2, h2 = 3;
	u32 levels[] = { 0, 0, 2, 2 };
	u32 expected[] = { h1, h2, l1, h1, h2, l2 };

	return test_interleaving_gen(m, g, 2 * ARRAY_SIZE(expected), levels,
			ARRAY_SIZE(levels), expected, ARRAY_SIZE(expected));
}

struct unit_module_test nvgpu_runlist_tests[] = {
	UNIT_TEST(tsg_format_ch1, test_tsg_format_gen, &tsg_fmt_tests[0], 0),
	UNIT_TEST(tsg_format_ch2, test_tsg_format_gen, &tsg_fmt_tests[1], 0),
	UNIT_TEST(tsg_format_ch5, test_tsg_format_gen, &tsg_fmt_tests[2], 0),
	UNIT_TEST(tsg_format_ch1_timeslice, test_tsg_format_gen,
			&tsg_fmt_tests[3], 0),
	UNIT_TEST(tsg_format_ch3_inactive2, test_tsg_format_gen,
			&tsg_fmt_tests[4], 0),
	UNIT_TEST(flat, test_flat, NULL, 0),
	UNIT_TEST(flat_oversize_tiny, test_flat_oversize_tiny, NULL, 0),
	UNIT_TEST(flat_oversize_single, test_flat_oversize_single, NULL, 0),
	UNIT_TEST(flat_oversize_onehalf, test_flat_oversize_onehalf, NULL, 0),
	UNIT_TEST(flat_oversize_two, test_flat_oversize_two, NULL, 0),
	UNIT_TEST(flat_oversize_end, test_flat_oversize_end, NULL, 0),
	UNIT_TEST(interleaving, test_interleaving_gen_all_run,
			&interleave_tests[0], 0),
	UNIT_TEST(interleaving_oversize_tiny, test_interleaving_gen_all_run,
			&interleave_tests[1], 0),
	UNIT_TEST(interleaving_oversize_l2, test_interleaving_gen_all_run,
			&interleave_tests[2], 0),
	UNIT_TEST(interleaving_oversize_l2_l1_l2,
			test_interleaving_gen_all_run,
			&interleave_tests[3], 0),
	UNIT_TEST(interleaving_oversize_l2_l1_l2_l1,
			test_interleaving_gen_all_run,
			&interleave_tests[4], 0),
	UNIT_TEST(interleaving_oversize_l2_l1_l2_l1_l2,
			test_interleaving_gen_all_run,
			&interleave_tests[5], 0),
	UNIT_TEST(interleaving_l0, test_interleaving_l0, NULL, 0),
	UNIT_TEST(interleaving_l1, test_interleaving_l1, NULL, 0),
	UNIT_TEST(interleaving_l2, test_interleaving_l2, NULL, 0),
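	/* pairwise combinations of two interleave levels */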
	UNIT_TEST(interleaving_l0_l1, test_interleaving_l0_l1, NULL, 0),
	UNIT_TEST(interleaving_l1_l2, test_interleaving_l1_l2, NULL, 0),
	UNIT_TEST(interleaving_l0_l2, test_interleaving_l0_l2, NULL, 0),
};

UNIT_MODULE(nvgpu_runlist, nvgpu_runlist_tests, UNIT_PRIO_NVGPU_TEST);