/*
 * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <nvgpu/gk20a.h>
#include <nvgpu/kmem.h>
#include <nvgpu/lock.h>
#include <nvgpu/kref.h>
#include <nvgpu/sort.h>
#include <nvgpu/timers.h>
#include <nvgpu/debug.h>
#include <nvgpu/swprofile.h>

/*
 * A simple profiler, capable of generating simple stats for a set of samples.
 */

/*
 * The sample array is a 1d array comprised of repeating rows of data. To
 * index the array as though it were a row-major matrix, we need to do some
 * simple math.
 */
static inline u32 matrix_to_linear_index(struct nvgpu_swprofiler *p,
					 u32 row, u32 col)
{
	return (row * p->psample_len) + col;
}

/*
 * Just check the samples field; it'll be allocated for an enabled profiler.
 * This is an intrinsically racy call; don't rely on it to determine whether
 * the underlying pointers/fields really are initialized or not.
 *
 * However, since this doesn't take the profiler lock itself, you may call it
 * while holding that lock, in which case you can be sure the state won't
 * change for as long as you hold the lock.
 */
bool nvgpu_swprofile_is_enabled(struct nvgpu_swprofiler *p)
{
	return p->samples != NULL;
}

void nvgpu_swprofile_initialize(struct gk20a *g,
				struct nvgpu_swprofiler *p,
				const char *col_names[])
{
	if (p->col_names != NULL) {
		/*
		 * Profiler is already initialized.
		 */
		return;
	}

	nvgpu_mutex_init(&p->lock);

	p->g = g;
	p->col_names = col_names;

	p->psample_len = 0U;
	while (col_names[p->psample_len] != NULL) {
		p->psample_len++;
	}
}
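/*
 * Typical call flow, shown as an illustrative sketch only: the caller, the
 * column names and the snapshot points below are hypothetical and not part
 * of this file. The profiler object is assumed to be embedded in a longer
 * lived, zero-initialized structure.
 *
 *	static const char *op_cols[] = { "start", "map", "copy", "unmap", NULL };
 *
 *	nvgpu_swprofile_initialize(g, prof, op_cols);
 *	if (nvgpu_swprofile_open(g, prof) != 0) {
 *		return;
 *	}
 *
 *	// Once per pass over the operation being profiled:
 *	nvgpu_swprofile_begin_sample(prof);
 *	nvgpu_swprofile_snapshot(prof, 0U);	// "start"
 *	// ... map ...
 *	nvgpu_swprofile_snapshot(prof, 1U);	// "map"
 *	// ... copy ...
 *	nvgpu_swprofile_snapshot(prof, 2U);	// "copy"
 *	// ... unmap ...
 *	nvgpu_swprofile_snapshot(prof, 3U);	// "unmap"
 *
 *	// When profiling is no longer needed, drop the reference:
 *	nvgpu_swprofile_close(prof);
 */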
int nvgpu_swprofile_open(struct gk20a *g, struct nvgpu_swprofiler *p)
{
	int ret = 0;

	nvgpu_mutex_acquire(&p->lock);

	/*
	 * If this profiler is already opened, just take a ref and return.
	 */
	if (p->samples != NULL) {
		nvgpu_ref_get(&p->ref);
		nvgpu_mutex_release(&p->lock);
		return 0;
	}

	/*
	 * Otherwise allocate the necessary data structures, etc.
	 */
	p->samples = nvgpu_vzalloc(g,
				   PROFILE_ENTRIES * p->psample_len *
				   sizeof(*p->samples));
	if (p->samples == NULL) {
		ret = -ENOMEM;
		goto fail;
	}

	p->samples_start = nvgpu_vzalloc(g,
					 PROFILE_ENTRIES *
					 sizeof(*p->samples_start));
	if (p->samples_start == NULL) {
		ret = -ENOMEM;
		goto fail;
	}

	nvgpu_ref_init(&p->ref);

	nvgpu_mutex_release(&p->lock);
	return 0;

fail:
	if (p->samples != NULL) {
		nvgpu_vfree(g, p->samples);
		p->samples = NULL;
	}

	nvgpu_mutex_release(&p->lock);
	return ret;
}

static void nvgpu_swprofile_free(struct nvgpu_ref *ref)
{
	struct nvgpu_swprofiler *p = container_of(ref,
						  struct nvgpu_swprofiler,
						  ref);

	nvgpu_vfree(p->g, p->samples);
	nvgpu_vfree(p->g, p->samples_start);

	p->samples = NULL;
	p->samples_start = NULL;
}

void nvgpu_swprofile_close(struct nvgpu_swprofiler *p)
{
	nvgpu_ref_put(&p->ref, nvgpu_swprofile_free);
}

static void nvgpu_profile_print_col_header(struct nvgpu_swprofiler *p,
					   struct nvgpu_debug_context *o)
{
	u32 i;

	for (i = 0U; i < p->psample_len; i++) {
		gk20a_debug_output(o, " %15s", p->col_names[i]);
	}
	gk20a_debug_output(o, "\n");
}

/*
 * Note: this does _not_ lock the profiler. This is a conscious choice. If we
 * do lock the profiler then there's the possibility that you get bad data due
 * to the snapshot blocking on some other user printing the contents of the
 * profiler.
 *
 * Instead, this way, it's possible that someone printing the data in the
 * profiler gets a sample that's a mix of old and new. That's not great, but
 * IMO better than a completely bogus sample.
 *
 * Also it's really quite unlikely for this race to happen in practice as the
 * print function is executed as a result of a debugfs call.
 */
void nvgpu_swprofile_snapshot(struct nvgpu_swprofiler *p, u32 idx)
{
	u32 index;

	/*
	 * Handle two cases: the first allows calling code to simply skip
	 * any profiling by passing in a NULL profiler; see the CDE code
	 * for this. The second case is if a profiler is not "opened".
	 */
	if (p == NULL || p->samples == NULL) {
		return;
	}

	/*
	 * p->sample_index is the current row, aka sample, we are writing to.
	 * idx is the column - i.e. the sub-sample.
	 */
	index = matrix_to_linear_index(p, p->sample_index, idx);

	p->samples[index] = (u64)nvgpu_current_time_ns();
}

void nvgpu_swprofile_begin_sample(struct nvgpu_swprofiler *p)
{
	/*
	 * A NULL profiler is a no-op, just like in the snapshot function;
	 * check it before dereferencing p to take the lock.
	 */
	if (p == NULL) {
		return;
	}

	nvgpu_mutex_acquire(&p->lock);

	if (p->samples == NULL) {
		nvgpu_mutex_release(&p->lock);
		return;
	}

	p->sample_index++;

	/* Handle wrap. */
	if (p->sample_index >= PROFILE_ENTRIES) {
		p->sample_index = 0U;
	}

	/*
	 * Reference time for subsequent subsamples in this sample.
	 */
	p->samples_start[p->sample_index] = (u64)nvgpu_current_time_ns();

	nvgpu_mutex_release(&p->lock);
}
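/*
 * Layout example, with illustrative numbers only: for a profiler with four
 * columns, psample_len == 4 and the samples array is row-major, so row r
 * occupies samples[4*r] through samples[4*r + 3] and
 * matrix_to_linear_index(p, 2, 3) == 11. The duration recorded for column c
 * of row r is then recovered as
 *
 *	samples[matrix_to_linear_index(p, r, c)] - samples_start[r]
 *
 * which is exactly the subtraction the raw-data printer below performs.
 */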
static int profile_cmp(const void *a, const void *b)
{
	u64 sample_a = *((const u64 *) a);
	u64 sample_b = *((const u64 *) b);

	/*
	 * Compare explicitly instead of casting the u64 difference to int;
	 * the cast would wrap for large deltas and could mis-sort.
	 */
	if (sample_a < sample_b) {
		return -1;
	}
	if (sample_a > sample_b) {
		return 1;
	}

	return 0;
}

#define PERCENTILE_WIDTH	5
#define PERCENTILE_RANGES	(100/PERCENTILE_WIDTH)

static u32 nvgpu_swprofile_build_ranges(struct nvgpu_swprofiler *p,
					u64 *storage,
					u64 *percentiles,
					u32 index_end,
					u32 index_start)
{
	u32 i;
	u32 nelem = 0U;

	/*
	 * Iterate through a column and build a temporary slice array of
	 * samples so that we can sort them without corrupting the current
	 * data.
	 *
	 * Note that we have to first convert the row/column indexes into
	 * linear indexes to access the underlying sample array.
	 */
	for (i = 0; i < PROFILE_ENTRIES; i++) {
		u32 linear_idx_start = matrix_to_linear_index(p, i, index_start);
		u32 linear_idx_end = matrix_to_linear_index(p, i, index_end);

		if (p->samples[linear_idx_end] <=
		    p->samples[linear_idx_start]) {
			/* This is an invalid element */
			continue;
		}

		storage[nelem] = p->samples[linear_idx_end] -
				 p->samples[linear_idx_start];
		nelem++;
	}

	/* sort it */
	sort(storage, nelem, sizeof(u64), profile_cmp, NULL);

	/* build ranges */
	for (i = 0; i < PERCENTILE_RANGES; i++) {
		percentiles[i] = nelem < PERCENTILE_RANGES ? 0 :
			storage[(PERCENTILE_WIDTH * (i + 1) * nelem)/100 - 1];
	}

	return nelem;
}

/*
 * Print a list of percentiles spaced by 5%. Note that the debug_context needs
 * to be special here. _Most_ print functions in NvGPU automatically add a new
 * line to the end of each print statement. This function _specifically_
 * requires that your debug print function does _NOT_ do this.
 */
void nvgpu_swprofile_print_ranges(struct gk20a *g,
				  struct nvgpu_swprofiler *p,
				  struct nvgpu_debug_context *o)
{
	u32 nelem = 0U, i, j;
	u64 *sorted_data = NULL;
	u64 *percentiles = NULL;

	nvgpu_mutex_acquire(&p->lock);

	if (p->samples == NULL) {
		gk20a_debug_output(o, "Profiler not enabled.\n");
		goto done;
	}

	sorted_data = nvgpu_vzalloc(g,
				    PROFILE_ENTRIES * p->psample_len *
				    sizeof(u64));
	percentiles = nvgpu_vzalloc(g,
				    PERCENTILE_RANGES * p->psample_len *
				    sizeof(u64));
	if (!sorted_data || !percentiles) {
		nvgpu_err(g, "vzalloc: OOM!");
		goto done;
	}

	/*
	 * Loop over each column; sort the column's data and then build
	 * percentile ranges based on that sorted data.
	 */
	for (i = 0U; i < p->psample_len; i++) {
		nelem = nvgpu_swprofile_build_ranges(p,
						     &sorted_data[i * PROFILE_ENTRIES],
						     &percentiles[i * PERCENTILE_RANGES],
						     i, 0U);
	}

	gk20a_debug_output(o, "Samples: %u\n", nelem);
	gk20a_debug_output(o, "%6s", "Perc");
	nvgpu_profile_print_col_header(p, o);

	gk20a_debug_output(o, "%6s", "----");
	for (i = 0U; i < p->psample_len; i++) {
		gk20a_debug_output(o, " %15s", "---------------");
	}
	gk20a_debug_output(o, "\n");

	/*
	 * percentiles is another matrix, but this time it's using column
	 * major indexing.
	 */
	for (i = 0U; i < PERCENTILE_RANGES; i++) {
		gk20a_debug_output(o, "%3upc ", PERCENTILE_WIDTH * (i + 1));

		for (j = 0U; j < p->psample_len; j++) {
			gk20a_debug_output(o, " %15llu",
					   percentiles[(j * PERCENTILE_RANGES) + i]);
		}

		gk20a_debug_output(o, "\n");
	}
	gk20a_debug_output(o, "\n");

done:
	nvgpu_vfree(g, sorted_data);
	nvgpu_vfree(g, percentiles);
	nvgpu_mutex_release(&p->lock);
}
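/*
 * Worked example for the percentile table above, with illustrative numbers:
 * PERCENTILE_WIDTH == 5 gives 20 ranges per column. For a sorted column of
 * nelem == 200 durations, range i reads
 *
 *	storage[(5 * (i + 1) * 200) / 100 - 1]
 *
 * i.e. storage[9] for the 5th percentile (i == 0), storage[99] for the 50th
 * (i == 9) and storage[199] for the 100th (i == 19). In the printed table,
 * the value for column j in that row comes from
 * percentiles[(j * PERCENTILE_RANGES) + i], since the percentiles scratch
 * buffer is column-major.
 */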
/*
 * Print raw data for the profiler. Can be useful if you want to do more
 * sophisticated analysis in python or something like that.
 *
 * Note this requires a debug context that does not automatically add
 * newlines.
 */
void nvgpu_swprofile_print_raw_data(struct gk20a *g,
				    struct nvgpu_swprofiler *p,
				    struct nvgpu_debug_context *o)
{
	u32 i, j;

	(void)g;

	nvgpu_mutex_acquire(&p->lock);

	if (p->samples == NULL) {
		gk20a_debug_output(o, "Profiler not enabled.\n");
		goto done;
	}

	gk20a_debug_output(o, "max samples: %u, sample len: %u\n",
			   PROFILE_ENTRIES, p->psample_len);
	nvgpu_profile_print_col_header(p, o);

	for (i = 0U; i < PROFILE_ENTRIES; i++) {
		for (j = 0U; j < p->psample_len; j++) {
			u32 index = matrix_to_linear_index(p, i, j);

			gk20a_debug_output(o, " %15llu",
					   p->samples[index] -
					   p->samples_start[i]);
		}
		gk20a_debug_output(o, "\n");
	}

done:
	nvgpu_mutex_release(&p->lock);
}

/*
 * Print stats for a single column. This covers:
 *
 *   Min
 *   Max
 *   Mean
 *   Median
 *   Sigma^2
 *
 * Note that the results array has to be at least 5 entries long. Storage
 * should be an array that is at least PROFILE_ENTRIES long. This is used for
 * working out the median - we need a sorted sample set for that.
 *
 * Note: this skips empty samples.
 *
 * Note: there's a limit to the sensitivity of these profiling stats. For
 * things that happen faster than the granularity of the underlying timer,
 * you'll need to use something more sophisticated. It's ok to have some
 * zeros, but too many and you won't get a very interesting picture of the
 * data.
 */
static u32 nvgpu_swprofile_subsample_basic_stats(struct gk20a *g,
						 struct nvgpu_swprofiler *p,
						 u32 subsample,
						 u64 *results,
						 u64 *storage)
{
	u64 sum = 0U, samples = 0U;
	u64 min = U64_MAX, max = 0U;
	u64 mean, median;
	u64 sigma_2 = 0U;
	u32 i;

	(void)g;

	/*
	 * First, let's work out min, max, sum, and number of samples of data.
	 * With this we can then get the mean, median, and sigma^2.
	 */
	for (i = 0U; i < PROFILE_ENTRIES; i++) {
		u32 ss = matrix_to_linear_index(p, i, subsample);
		u64 sample = p->samples[ss] - p->samples_start[i];

		if (p->samples_start[i] == 0U) {
			continue;
		}

		if (sample < min) {
			min = sample;
		}
		if (sample > max) {
			max = sample;
		}

		storage[samples] = sample;

		sum += sample;
		samples += 1U;
	}

	/*
	 * If we did not collect a single valid sample (min would still be
	 * U64_MAX), report all zeros rather than dividing by the zero sample
	 * count below.
	 */
	if (samples == 0U) {
		results[0] = 0U;
		results[1] = 0U;
		results[2] = 0U;
		results[3] = 0U;
		results[4] = 0U;
		return 0U;
	}

	if (min == U64_MAX) {
		min = 0U;
	}

	/* With the sorted list of samples we can easily compute the median. */
	sort(storage, samples, sizeof(u64), profile_cmp, NULL);

	mean = sum / samples;
	median = storage[samples / 2];

	/* Compute the sample variance (i.e sigma squared). */
	for (i = 0U; i < samples; i++) {
		sigma_2 += storage[i] * storage[i];
	}

	/* Remember: _sample_ variance. Guard against a single-sample column. */
	if (samples > 1U) {
		sigma_2 /= (samples - 1U);
		sigma_2 -= (mean * mean);
	} else {
		sigma_2 = 0U;
	}

	results[0] = min;
	results[1] = max;
	results[2] = mean;
	results[3] = median;
	results[4] = sigma_2;

	return (u32)samples;
}

/*
 * Print the following stats for each column:
 *
 *   Min, Max, Mean, Median, Sigma^2
 */
void nvgpu_swprofile_print_basic_stats(struct gk20a *g,
				       struct nvgpu_swprofiler *p,
				       struct nvgpu_debug_context *o)
{
	u32 i;
	const char *fmt_header = "%-18s %15s %15s %15s %15s %15s\n";
	const char *fmt_output = "%-18s %15llu %15llu %15llu %15llu %15llu\n";
	u64 *storage;
	u32 samples = 0U;

	if (p->samples == NULL) {
		gk20a_debug_output(o, "Profiler not enabled.\n");
		return;
	}

	storage = nvgpu_kzalloc(g, sizeof(u64) * PROFILE_ENTRIES);
	if (storage == NULL) {
		gk20a_debug_output(o, "OOM!");
		return;
	}

	nvgpu_mutex_acquire(&p->lock);

	gk20a_debug_output(o, fmt_header,
			   "SubSample", "Min", "Max", "Mean", "Median", "Sigma^2");
	gk20a_debug_output(o, fmt_header,
			   "---------", "---", "---", "----", "------", "-------");

	for (i = 0U; i < p->psample_len; i++) {
		u64 results[5];

		samples = nvgpu_swprofile_subsample_basic_stats(g, p, i,
								results,
								storage);

		gk20a_debug_output(o, fmt_output, p->col_names[i],
				   results[0], results[1], results[2],
				   results[3], results[4]);
	}

	gk20a_debug_output(o, "Number of samples: %u\n", samples);

	nvgpu_mutex_release(&p->lock);

	nvgpu_kfree(g, storage);
}
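/*
 * A small worked example for the stats above, using hypothetical durations
 * (in ns): if a column's valid samples sort to { 3, 5, 8, 100 }, the code
 * reports min == 3, max == 100, mean == 116 / 4 == 29 (integer division) and
 * median == storage[4 / 2] == 8. The median is far less sensitive to the
 * single outlier than the mean, which is why both are printed.
 */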