// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2019 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

static_assert
(
	__linux__,
	"This unit is only compiled for linux targets."
);

#include <RB_INC_SYS_SYSCALL_H
#include <RB_INC_SYS_IOCTL_H
#include <RB_INC_SYS_MMAN_H
#include <RB_INC_SYS_RESOURCE_H
#include <linux/perf_event.h>

#ifndef __clang__
	#define IRCD_PROF_ALWAYS_OPTIMIZE __attribute__((optimize("s"), flatten))
#else
	#define IRCD_PROF_ALWAYS_OPTIMIZE
#endif

namespace ircd::prof
{
	std::ostream &debug(std::ostream &, const ::perf_event_mmap_page &);

	template<class... args> event *
	create(group &,
	       const uint32_t &,
	       const uint64_t &,
	       args&&...);

	static event &leader(group &);
	static event *leader(group *const &);
}

struct ircd::prof::event
:instance_list<event>
{
	perf_event_attr attr;
	fs::fd fd;
	uint64_t id {0};
	size_t map_size {0};
	char *map {nullptr};
	perf_event_mmap_page *head {nullptr};
	const_buffer body;

	uint64_t rdpmc() const;
	long ioctl(const ulong &req, const long &arg = 0);
	void reset(const long & = 0);
	void enable(const long & = 0);
	void disable(const long & = 0);

	event(const int &group,
	      const uint32_t &type,
	      const uint64_t &config,
	      const bool &user,
	      const bool &kernel,
	      const bool &use_map = true);

	~event() noexcept;
};

template<>
decltype(ircd::util::instance_list<ircd::prof::event>::allocator)
ircd::util::instance_list<ircd::prof::event>::allocator
{};

template<>
decltype(ircd::util::instance_list<ircd::prof::event>::list)
ircd::util::instance_list<ircd::prof::event>::list
{
	allocator
};

//
// prof
//

void
ircd::prof::reset(group &group)
{
	leader(group).reset(PERF_IOC_FLAG_GROUP);
}

void
IRCD_PROF_ALWAYS_OPTIMIZE
ircd::prof::start(group &group)
{
	leader(group).enable(PERF_IOC_FLAG_GROUP);
}

void
IRCD_PROF_ALWAYS_OPTIMIZE
ircd::prof::stop(group &group)
{
	auto &leader(*group.front());
	leader.disable(PERF_IOC_FLAG_GROUP);
	assert(!group.empty());
}

ircd::prof::event &
IRCD_PROF_ALWAYS_OPTIMIZE
ircd::prof::leader(group &group)
{
	assert(!group.empty() && group.front());
	return *group.front();
}

ircd::prof::event *
ircd::prof::leader(group *const &group)
{
	return group && !group->empty()?
		group->front().get():
		nullptr;
}

template<class... args>
ircd::prof::event *
ircd::prof::create(group &group,
                   const uint32_t &type,
                   const uint64_t &config,
                   args&&... a)
try
{
	const int gfd
	{
		leader(&group)? leader(group).fd : -1
	};

	group.emplace_back(std::make_unique<event>
	(
		gfd, type, config, std::forward<args>(a)...
	));

	return group.back().get();
}
catch(const std::exception &e)
{
	log::dwarning
	{
		"Failed to create event type:%u config:%lu :%s",
		type,
		config,
		e.what()
	};

	return nullptr;
}

///////////////////////////////////////////////////////////////////////////////
//
// prof/psi.h
//

decltype(ircd::prof::psi::supported)
ircd::prof::psi::supported
{
	info::kernel_version[0] > 4 ||
	(info::kernel_version[0] >= 4 && info::kernel_version[1] >= 20)
};

decltype(ircd::prof::psi::path)
ircd::prof::psi::path
{
	"/proc/pressure/cpu",
	"/proc/pressure/memory",
	"/proc/pressure/io",
};

decltype(ircd::prof::psi::cpu)
ircd::prof::psi::cpu
{
	"cpu"
};

decltype(ircd::prof::psi::mem)
ircd::prof::psi::mem
{
	"memory"
};

decltype(ircd::prof::psi::io)
ircd::prof::psi::io
{
	"io"
};

//
// prof::psi::metric::refresh
//

ircd::prof::psi::file &
ircd::prof::psi::wait(const vector_view<const trigger> &cmd)
try
{
	static const size_t max{3};
	size_t trig_num {0}, trig_idx[max]
	{
		size_t(-1),
		size_t(-1),
		size_t(-1),
	};

	// Associate all of the trigger inputs (cmd) with one of the files; the
	// cmds can be arranged any way and may not be for all files or any.
	for(size_t i(0); i < cmd.size(); ++i)
	{
		const auto it
		{
			std::find_if(begin(path), end(path), [&cmd, &i]
			(const auto &name)
			{
				return lstrip(name, "/proc/pressure/") == cmd[i].file.name;
			})
		};

		const auto pos
		{
			std::distance(begin(path), it)
		};

		if(unlikely(size_t(pos) >= max))
			throw error
			{
				"%s does not exist",
				cmd[i].file.name,
			};

		trig_idx[pos] = i;
		trig_num++;
	}

	const fs::fd::opts opts
	{
		std::ios::in | std::ios::out
	};

	// Open the fd's; if triggers were given we don't open files that were
	// not included in the cmd vector; otherwise we open all files.
	const fs::fd fd[max]
	{
		!trig_num || trig_idx[0] < max?
			fs::fd{path[0], opts}:
			fs::fd{},

		!trig_num || trig_idx[1] < max?
			fs::fd{path[1], opts}:
			fs::fd{},

		!trig_num || trig_idx[2] < max?
			fs::fd{path[2], opts}:
			fs::fd{},
	};

	// Write all triggers to their respective file
	for(size_t i(0); i < max; ++i)
	{
		if(trig_idx[i] >= max)
			continue;

		const auto &trig(cmd[trig_idx[i]]); try
		{
			// psi_write() in the kernel wants a write length of one greater
			// than the length of the string, but it places a \0 in its own
			// buffer unconditionally. This is noteworthy because our string
			// may not be null terminated and this length requirement smells.
			assert(trig.file.name == lstrip(path[i], "/proc/pressure/"));
			syscall(::write, fd[i], trig.string.c_str(), size(trig.string) + 1);
		}
		catch(const ctx::interrupted &)
		{
			throw;
		}
		catch(const std::exception &e)
		{
			log::error
			{
				"Failed to set pressure stall trigger [%s] on /proc/pressure/%s :%s",
				trig.string,
				trig.file.name,
				e.what(),
			};

			throw;
		}
	}

	// Yield ircd::ctx until fd[n] has a result.
	const size_t n
	{
		fs::select(fd)
	};

	switch(n)
	{
		case 0:  return cpu;
		case 1:  return mem;
		case 2:  return io;
		default:
			always_assert(false);
			__builtin_unreachable();
	}
}
catch(const ctx::interrupted &)
{
	throw;
}
catch(const std::exception &e)
{
	log::error
	{
		"Failed to poll pressure stall information :%s",
		e.what(),
	};

	throw;
}

bool
ircd::prof::psi::refresh(file &file)
noexcept try
{
	if(!supported)
		return false;

	if(unlikely(!file.name))
		return false;

	thread_local unique_mutable_buffer path_buf
	{
		fs::PATH_MAX_LEN
	};

	const auto &path
	{
		fs::path(path_buf, vector_view<const string_view>
		{
			"/proc/pressure"_sv, file.name
		})
	};

	// Copy value into userspace
	char buf[256];
	fs::read_opts opts;
	opts.aio = false; // can't read /proc through AIO
	opts.all = false; // don't need posix read-loop; make one read(2) only.
	const auto &result
	{
		fs::read(path, buf, opts)
	};

	tokens(result, '\n', [&file] // Read each line
	(const string_view &line)
	{
		const auto &[type, vals]
		{
			split(line, ' ')
		};

		// The first token tells us what the metric is; we have allocated
		// results for the following
		if(type != "full" && type != "some")
			return;

		auto &metric
		{
			type == "full"?
				file.full:
				file.some
		};

		size_t i(0);
		tokens(vals, ' ', [&file, &metric, &i] // Read each key=value pair
		(const string_view &key_val)
		{
			const auto &[key, val]
			{
				split(key_val, '=')
			};

			if(key == "total")
			{
				const auto total(lex_cast<microseconds>(val));
				metric.stall.relative = total - metric.stall.total;
				metric.stall.window = duration_cast<microseconds>(now<system_point>() - file.sampled);
				metric.stall.pct = metric.stall.window.count()?
					metric.stall.relative.count() / double(metric.stall.window.count()):
					0.0;
				metric.stall.pct *= 100;
				metric.stall.total = total;
				return;
			}
			else if(startswith(key, "avg") && i < metric.avg.size())
			{
				metric.avg.at(i).window = lex_cast<seconds>(lstrip(key, "avg"));
				metric.avg.at(i).pct = lex_cast<float>(val);
				++i;
			}
		});
	});

	file.sampled = ircd::now<system_point>();
	return true;
}
catch(const std::exception &e)
{
	log::error
	{
		"Failed to refresh pressure stall information '%s' :%s",
		file.name,
		e.what(),
	};

	return false;
}

///////////////////////////////////////////////////////////////////////////////
//
// prof/instructions.h
//

ircd::prof::instructions::instructions()
{
	if(!create(this->group, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, true, false))
		throw error
		{
			"Cannot sample instruction counter."
		};

	reset(this->group);
	start(this->group);
}

ircd::prof::instructions::~instructions()
noexcept
{
}

const uint64_t &
IRCD_PROF_ALWAYS_OPTIMIZE
ircd::prof::instructions::sample()
{
	retired = prof::leader(group).rdpmc();
	return retired;
}

const uint64_t &
ircd::prof::instructions::at()
const
{
	return retired;
}

//
// time_*() suite
//

uint64_t
ircd::prof::time_thrd()
{
	struct ::timespec tv;
	syscall(::clock_gettime, CLOCK_THREAD_CPUTIME_ID, &tv);
	return ulong(tv.tv_sec) * 1000000000UL + tv.tv_nsec;
}

uint64_t
ircd::prof::time_proc()
{
	struct ::timespec tv;
	syscall(::clock_gettime, CLOCK_PROCESS_CPUTIME_ID, &tv);
	return ulong(tv.tv_sec) * 1000000000UL + tv.tv_nsec;
}

///////////////////////////////////////////////////////////////////////////////
//
// prof::system
//

decltype(ircd::prof::system::group)
ircd::prof::system::group;

ircd::prof::system
ircd::prof::operator-(const system &a,
                      const system &b)
{
	system ret(a);
	ret -= b;
	return ret;
}

ircd::prof::system
ircd::prof::operator+(const system &a,
                      const system &b)
{
	system ret(a);
	ret += b;
	return ret;
}

ircd::prof::system &
ircd::prof::operator-=(system &a,
                       const system &b)
{
	for(size_t i(0); i < a.size(); ++i)
		for(size_t j(0); j < a[i].size(); ++j)
			a[i][j] -= b[i][j];

	return a;
}

ircd::prof::system &
ircd::prof::operator+=(system &a,
                       const system &b)
{
	for(size_t i(0); i < a.size(); ++i)
		for(size_t j(0); j < a[i].size(); ++j)
			a[i][j] += b[i][j];

	return a;
}

ircd::prof::system &
ircd::prof::hotsample(system &s)
noexcept
{
	thread_local char buf[1024];

	auto &leader
	{
		prof::leader(system::group)
	};

	const const_buffer read
	{
		buf,  size_t(syscall(::read, int(leader.fd), buf, sizeof(buf)))
	};

	for_each(read, [&s]
	(const type &type, const uint64_t &val)
	{
		auto &r0
		{
			s.at(size_t(type.counter))
		};

		auto &r1
		{
			r0.at(size_t(type.dpl))
		};

		r1 = val;
	});

	return s;
}

void
ircd::prof::for_each(const const_buffer &buf,
                     const read_closure &closure)
{
	struct head
	{
		uint64_t nr, te, tr;
	}
	const *const &head
	{
		reinterpret_cast<const struct head *>(data(buf))
	};

	struct body
	{
		uint64_t val, id;
	}
	const *const &body
	{
		reinterpret_cast<const struct body *>(data(buf) + sizeof(struct head))
	};

	// Start with the pseudo-results; these should always be the same for
	// non-hw profiling, so the DPL is meaningless.
	closure(type{dpl::KERNEL, uint8_t(-1)}, head->te);
	closure(type{dpl::USER, uint8_t(-1)}, head->tr);

	// Iterate the result list
	for(size_t i(0); i < head->nr; ++i)
		for(auto it(begin(event::list)); it != end(event::list); ++it)
			if((*it)->id == body[i].id)
				return closure(type(**it), body[i].val);
}

ircd::prof::system::system(sample_t)
noexcept
{
	stop(group);
	hotsample(*this);
	start(group);
}

ircd::prof::system::~system()
noexcept
{
}

/*
	create(system::group, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK,          true,  false);
	create(system::group, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK,          false,  true);
	create(system::group, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK,         true,  false);
	create(system::group, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK,         false,  true);
	create(system::group, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN,    true,  false);
	create(system::group, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN,    false,  true);
	create(system::group, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MAJ,    true,  false);
	create(system::group, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MAJ,    false,  true);
	create(system::group, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES,   true,  false);
	create(system::group, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES,   false,  true);
	create(system::group, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_MIGRATIONS,     true,  false);
	create(system::group, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_MIGRATIONS,     false,  true);

	system::group.clear();
*/

///////////////////////////////////////////////////////////////////////////////
//
// prof::event
//

ircd::prof::event::event(const int &group,
                         const uint32_t &type,
                         const uint64_t &config,
                         const bool &user,
                         const bool &kernel,
                         const bool &use_map)
:attr{[&]
{
	struct ::perf_event_attr ret {0};
	ret.size = sizeof(ret);

	ret.type = type;
	ret.config = config;
	ret.exclude_user = !user;
	ret.exclude_kernel = !kernel;

	ret.read_format |= PERF_FORMAT_GROUP;
	ret.read_format |= PERF_FORMAT_ID;
	ret.read_format |= PERF_FORMAT_TOTAL_TIME_ENABLED;
	ret.read_format |= PERF_FORMAT_TOTAL_TIME_RUNNING;

	ret.exclude_idle = true;
	ret.exclude_host = false;
	ret.exclude_hv = true;
	ret.exclude_guest = true;
	ret.exclude_callchain_user = true;
	ret.exclude_callchain_kernel = true;

	ret.disabled = true;
	return ret;
}()}
,fd{[this, &group]
{
	ulong flags(0);
	flags |= PERF_FLAG_FD_CLOEXEC;

	const int cpu(-1);
	const pid_t pid(0);
	return int(syscall<SYS_perf_event_open>(&attr, pid, cpu, group, flags));
}()}
,id{[this]
{
	uint64_t ret;
	syscall(::ioctl, int(fd), PERF_EVENT_IOC_ID, &ret);
	return ret;
}()}
,map_size
{
	use_map && type == PERF_TYPE_HARDWARE?
		size_t(1UL + 0UL) * info::page_size:
		0UL
}
,map{[this]
{
	int prot(0);
	prot |= PROT_READ;
	prot |= PROT_WRITE;

	int flags(0);
	flags |= MAP_SHARED;

	void *const ret
	{
		map_size?
			::mmap(nullptr, map_size, prot, flags, int(this->fd), 0):
			nullptr
	};

	if(ret == (void *)-1)
		throw std::system_error
		{
			errno, std::system_category()
		};

	if(map_size && ret == nullptr)
		throw error
		{
			"mmap(2) failed on event (fd:%d)", int(fd)
		};

	return reinterpret_cast<char *>(ret);
}()}
,head
{
	map?
		reinterpret_cast<::perf_event_mmap_page *>(map):
		nullptr
}
,body
{
	head?
		map + head->data_offset:
		nullptr,
	head?
		head->data_size:
		0UL
}
{
	assert(size(body) % info::page_size == 0);
	assert(map_size % info::page_size == 0);
}

ircd::prof::event::~event()
noexcept
{
	assert(!map || map_size);
	assert(!map_size || map);

	if(map)
		syscall(::munmap, map, map_size);
}

inline void
IRCD_PROF_ALWAYS_OPTIMIZE
ircd::prof::event::disable(const long &arg)
{
	::ioctl(int(fd), PERF_EVENT_IOC_DISABLE, arg);
}

inline void
IRCD_PROF_ALWAYS_OPTIMIZE
ircd::prof::event::enable(const long &arg)
{
	const int &fd(this->fd);

	#if defined(__x86_64__)
		__builtin_ia32_mfence();
		__builtin_ia32_lfence();
	#endif

	::ioctl(fd, PERF_EVENT_IOC_ENABLE, arg);
}

void
ircd::prof::event::reset(const long &arg)
{
	ioctl(PERF_EVENT_IOC_RESET, arg);
}

long
ircd::prof::event::ioctl(const ulong &req,
                         const long &arg)
{
	return syscall(::ioctl, int(fd), req, arg);
}

inline uint64_t
IRCD_PROF_ALWAYS_OPTIMIZE
ircd::prof::event::rdpmc()
const
{
	assert(head->cap_user_time);
	assert(head->cap_user_rdpmc);

	uint64_t ret;
	uint32_t seq; do
	{
		seq = head->lock;
		__sync_synchronize();
		//assert(head->time_enabled == head->time_running);
		ret = head->offset;
		ret += head->index? x86::rdpmc(head->index - 1) : 0UL;
		__sync_synchronize();
	}
	while(head->lock != seq);
	return ret;
}

///////////////////////////////////////////////////////////////////////////////
//
// prof::type
//

ircd::prof::type::type(const enum dpl &dpl,
                       const uint8_t &type_id,
                       const uint8_t &counter,
                       const uint8_t &cacheop,
                       const uint8_t &cacheres)
:dpl{dpl}
,type_id{type_id}
,counter{counter}
,cacheop{cacheop}
,cacheres{cacheres}
{
}

ircd::prof::type::type(const event &event)
:dpl
{
	event.attr.exclude_kernel? dpl::USER : dpl::KERNEL
}
,type_id
{
	uint8_t(event.attr.type)
}
,counter
{
	uint8_t(event.attr.config)
}
,cacheop
{
	uint8_t(event.attr.config >> 8)
}
,cacheres
{
	uint8_t(event.attr.config >> 16)
}
{
}

///////////////////////////////////////////////////////////////////////////////
//
// internal
//

std::ostream &
ircd::prof::debug(std::ostream &s,
                  const ::perf_event_mmap_page &head)
{
	s << "version:               " << head.version              << std::endl;
	s << "compat:                " << head.compat_version       << std::endl;
	s << "lock:                  " << head.lock                 << std::endl;
	s << "index:                 " << head.index                << std::endl;
	s << "offset:                " << head.offset               << std::endl;
	s << "time_enabled:          " << head.time_enabled         << std::endl;
	s << "time_running:          " << head.time_running         << std::endl;
	s << "cap_user_rdpmc:        " << head.cap_user_rdpmc       << std::endl;
	s << "cap_user_time:         " << head.cap_user_time        << std::endl;
	s << "cap_user_time_zero:    " << head.cap_user_time_zero   << std::endl;
	s << "pmc_width:             " << head.pmc_width            << std::endl;
	s << "time_shift:            " << head.time_shift           << std::endl;
	s << "time_mult:             " << head.time_mult            << std::endl;
	s << "time_offset:           " << head.time_offset          << std::endl;
	s << "data_head:             " << head.data_head            << std::endl;
	s << "data_tail:             " << head.data_tail            << std::endl;
	s << "data_offset:           " << head.data_offset          << std::endl;
	s << "data_size:             " << head.data_size            << std::endl;
	s << "aux_head:              " << head.aux_head             << std::endl;
	s << "aux_tail:              " << head.aux_tail             << std::endl;
	s << "aux_offset:            " << head.aux_offset           << std::endl;
	s << "aux_size:              " << head.aux_size             << std::endl;

	return s;
}