/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _GEN_CNA_LOCK_SLOWPATH
#error "do not include this file"
#endif

#include <linux/topology.h>
#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
#include <linux/random.h>
#include <linux/moduleparam.h>

/*
 * Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock).
 *
 * In CNA, spinning threads are organized in two queues, a primary queue for
 * threads running on the same NUMA node as the current lock holder, and a
 * secondary queue for threads running on other nodes. Schematically, it
 * looks like this:
 *
 *      cna_node
 *   +----------+     +--------+         +--------+
 *   |mcs:next  | --> |mcs:next| --> ... |mcs:next| --> NULL  [Primary queue]
 *   |mcs:locked| -.  +--------+         +--------+
 *   +----------+  |
 *                 `----------------------.
 *                                        v
 *                 +--------+         +--------+
 *                 |mcs:next| --> ... |mcs:next|              [Secondary queue]
 *                 +--------+         +--------+
 *                     ^                    |
 *                     `--------------------'
 *
 * N.B. locked := 1 if secondary queue is absent. Otherwise, it contains the
 * encoded pointer to the tail of the secondary queue, which is organized as a
 * circular list.
 *
 * After acquiring the MCS lock and before acquiring the spinlock, the MCS lock
 * holder checks whether the next waiter in the primary queue (if exists) is
 * running on the same NUMA node. If it is not, that waiter is detached from
 * the main queue and moved into the tail of the secondary queue. This way, we
 * gradually filter the primary queue, leaving only waiters running on the same
 * preferred NUMA node. Note that certain prioritized waiters (e.g., in
 * irq and nmi contexts) are excluded from being moved to the secondary queue.
 *
 * We change the NUMA node preference after a waiter at the head of the
 * secondary queue spins for a certain amount of time (1ms, by default).
 * We do that by flushing the secondary queue into the head of the primary
 * queue, effectively changing the preference to the NUMA node of the waiter
 * at the head of the secondary queue at the time of the flush.
 *
 * For more details, see https://arxiv.org/abs/1810.05600.
 *
 * Authors: Alex Kogan
 *          Dave Dice
 */

#define FLUSH_SECONDARY_QUEUE	1

#define CNA_PRIORITY_NODE	0xffff

struct cna_node {
	struct mcs_spinlock	mcs;
	u16			numa_node;
	u16			real_numa_node;
	u32			encoded_tail;	/* self */
	u64			start_time;
};

static ulong numa_spinlock_threshold_ns = 1000000;	/* 1ms, by default */
module_param(numa_spinlock_threshold_ns, ulong, 0644);

static inline bool intra_node_threshold_reached(struct cna_node *cn)
{
	u64 current_time = local_clock();
	u64 threshold = cn->start_time + numa_spinlock_threshold_ns;

	return current_time > threshold;
}

/*
 * Controls the probability for enabling the ordering of the main queue
 * when the secondary queue is empty. The chosen value reduces the amount
 * of unnecessary shuffling of threads between the two waiting queues
 * when the contention is low, while responding fast enough and enabling
 * the shuffling when the contention is high.
 */
#define SHUFFLE_REDUCTION_PROB_ARG	(7)

/* Per-CPU pseudo-random number seed */
static DEFINE_PER_CPU(u32, seed);
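/*
 * For example, with SHUFFLE_REDUCTION_PROB_ARG == 7, probably(7) below
 * returns false with probability 1/2^7 (~0.8%), so roughly one in 128
 * calls to cna_wait_head_or_lock() falls through to cna_order_queue()
 * while the secondary queue is empty.
 */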
/*
 * Return false with probability 1 / 2^@num_bits.
 * Intuitively, the larger @num_bits the less likely false is to be returned.
 * @num_bits must be a number between 0 and 31.
 */
static bool probably(unsigned int num_bits)
{
	u32 s;

	s = this_cpu_read(seed);
	s = next_pseudo_random32(s);
	this_cpu_write(seed, s);

	return s & ((1 << num_bits) - 1);
}

static void __init cna_init_nodes_per_cpu(unsigned int cpu)
{
	struct mcs_spinlock *base = per_cpu_ptr(&qnodes[0].mcs, cpu);
	int numa_node = cpu_to_node(cpu);
	int i;

	for (i = 0; i < MAX_NODES; i++) {
		struct cna_node *cn = (struct cna_node *)grab_mcs_node(base, i);

		cn->real_numa_node = numa_node;
		cn->encoded_tail = encode_tail(cpu, i);
		/*
		 * make sure @encoded_tail is not confused with other valid
		 * values for @locked (0 or 1)
		 */
		WARN_ON(cn->encoded_tail <= 1);
	}
}

static int __init cna_init_nodes(void)
{
	unsigned int cpu;

	/*
	 * this will break on 32bit architectures, so we restrict
	 * the use of CNA to 64bit only (see arch/x86/Kconfig)
	 */
	BUILD_BUG_ON(sizeof(struct cna_node) > sizeof(struct qnode));
	/* we store an encoded tail word in the node's @locked field */
	BUILD_BUG_ON(sizeof(u32) > sizeof(unsigned int));

	for_each_possible_cpu(cpu)
		cna_init_nodes_per_cpu(cpu);

	return 0;
}

static __always_inline void cna_init_node(struct mcs_spinlock *node)
{
	bool priority = !in_task() || irqs_disabled() || rt_task(current);
	struct cna_node *cn = (struct cna_node *)node;

	cn->numa_node = priority ? CNA_PRIORITY_NODE : cn->real_numa_node;
	cn->start_time = 0;
}

/*
 * cna_splice_head -- splice the entire secondary queue onto the head of the
 * primary queue.
 *
 * Returns the new primary head node or NULL on failure.
 */
static struct mcs_spinlock *
cna_splice_head(struct qspinlock *lock, u32 val,
		struct mcs_spinlock *node, struct mcs_spinlock *next)
{
	struct mcs_spinlock *head_2nd, *tail_2nd;
	u32 new;

	tail_2nd = decode_tail(node->locked);
	head_2nd = tail_2nd->next;

	if (next) {
		/*
		 * If the primary queue is not empty, the primary tail doesn't
		 * need to change and we can simply link the secondary tail to
		 * the old primary head.
		 */
		tail_2nd->next = next;
	} else {
		/*
		 * When the primary queue is empty, the secondary tail becomes
		 * the primary tail.
		 */

		/*
		 * Speculatively break the secondary queue's circular link such
		 * that when the secondary tail becomes the primary tail it all
		 * works out.
		 */
		tail_2nd->next = NULL;

		/*
		 * tail_2nd->next = NULL;	old = xchg_tail(lock, tail);
		 *				prev = decode_tail(old);
		 * try_cmpxchg_release(...);	WRITE_ONCE(prev->next, node);
		 *
		 * If the following cmpxchg() succeeds, our stores will not
		 * collide.
		 */
		new = ((struct cna_node *)tail_2nd)->encoded_tail | _Q_LOCKED_VAL;
		if (!atomic_try_cmpxchg_release(&lock->val, &val, new)) {
			/* Restore the secondary queue's circular link. */
			tail_2nd->next = head_2nd;
			return NULL;
		}
	}

	/* The primary queue head now is what was the secondary queue head. */
	return head_2nd;
}

static inline bool cna_try_clear_tail(struct qspinlock *lock, u32 val,
				      struct mcs_spinlock *node)
{
	/*
	 * We're here because the primary queue is empty; check the secondary
	 * queue for remote waiters.
	 */
	if (node->locked > 1) {
		struct mcs_spinlock *next;

		/*
		 * When there are waiters on the secondary queue, try to move
		 * them back onto the primary queue and let them rip.
		 */
		next = cna_splice_head(lock, val, node, NULL);
		if (next) {
			arch_mcs_lock_handoff(&next->locked, 1);
			return true;
		}

		return false;
	}

	/* Both queues are empty. Do what MCS does. */
	return __try_clear_tail(lock, val, node);
}
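/*
 * Illustration of the splice performed by cna_splice_next() below, for
 * three example waiters: A holds the MCS lock, C runs on A's NUMA node,
 * B does not:
 *
 *   before:  primary: A -> B -> C      secondary: empty
 *   after:   primary: A -> C           secondary: B (circular list of one;
 *                                      A's mcs:locked holds B's encoded tail)
 */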
/*
 * cna_splice_next -- splice the next node from the primary queue onto
 * the secondary queue.
 */
static void cna_splice_next(struct mcs_spinlock *node,
			    struct mcs_spinlock *next,
			    struct mcs_spinlock *nnext)
{
	/* remove 'next' from the main queue */
	node->next = nnext;

	/* stick `next` on the secondary queue tail */
	if (node->locked <= 1) { /* if secondary queue is empty */
		struct cna_node *cn = (struct cna_node *)node;

		/* create secondary queue */
		next->next = next;

		cn->start_time = local_clock();
		/* secondary queue is not empty iff start_time != 0 */
		WARN_ON(!cn->start_time);
	} else {
		/* add to the tail of the secondary queue */
		struct mcs_spinlock *tail_2nd = decode_tail(node->locked);
		struct mcs_spinlock *head_2nd = tail_2nd->next;

		tail_2nd->next = next;
		next->next = head_2nd;
	}

	node->locked = ((struct cna_node *)next)->encoded_tail;
}

/*
 * cna_order_queue - check whether the next waiter in the main queue is on
 * the same NUMA node as the lock holder; if not, and it has a waiter behind
 * it in the main queue, move the former onto the secondary queue.
 * Returns 1 if the next waiter runs on the same NUMA node; 0 otherwise.
 */
static int cna_order_queue(struct mcs_spinlock *node)
{
	struct mcs_spinlock *next = READ_ONCE(node->next);
	struct cna_node *cn = (struct cna_node *)node;
	int numa_node, next_numa_node;

	if (!next)
		return 0;

	numa_node = cn->numa_node;
	next_numa_node = ((struct cna_node *)next)->numa_node;

	if (next_numa_node != numa_node && next_numa_node != CNA_PRIORITY_NODE) {
		struct mcs_spinlock *nnext = READ_ONCE(next->next);

		if (nnext)
			cna_splice_next(node, next, nnext);

		return 0;
	}

	return 1;
}

#define LOCK_IS_BUSY(lock) (atomic_read(&(lock)->val) & _Q_LOCKED_PENDING_MASK)

/* Abuse the pv_wait_head_or_lock() hook to get some work done */
static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock,
						 struct mcs_spinlock *node)
{
	struct cna_node *cn = (struct cna_node *)node;

	if (node->locked <= 1 && probably(SHUFFLE_REDUCTION_PROB_ARG)) {
		/*
		 * When the secondary queue is empty, skip the calls to
		 * cna_order_queue() below with high probability. This
		 * optimization reduces the overhead of unnecessary shuffling
		 * of threads between waiting queues when the lock is only
		 * lightly contended.
		 */
		return 0;
	}

	if (!cn->start_time || !intra_node_threshold_reached(cn)) {
		/*
		 * We are at the head of the wait queue, no need to use
		 * the fake NUMA node ID.
		 */
		if (cn->numa_node == CNA_PRIORITY_NODE)
			cn->numa_node = cn->real_numa_node;

		/*
		 * Try and put the time otherwise spent spin waiting on
		 * _Q_LOCKED_PENDING_MASK to use by sorting our lists.
		 */
		while (LOCK_IS_BUSY(lock) && !cna_order_queue(node))
			cpu_relax();
	} else {
		cn->start_time = FLUSH_SECONDARY_QUEUE;
	}

	return 0; /* we lied; we didn't wait, go do so now */
}
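/*
 * Note on cna_node.start_time, which doubles as a flag:
 *   0                       - the secondary queue is empty
 *   FLUSH_SECONDARY_QUEUE   - cna_wait_head_or_lock() decided to flush the
 *                             secondary queue on the next lock handoff
 *   anything else           - local_clock() timestamp taken when the
 *                             secondary queue was created, consumed by
 *                             intra_node_threshold_reached()
 */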
static inline void cna_lock_handoff(struct mcs_spinlock *node,
				    struct mcs_spinlock *next)
{
	struct cna_node *cn = (struct cna_node *)node;
	u32 val = 1;

	if (cn->start_time != FLUSH_SECONDARY_QUEUE) {
		if (node->locked > 1) {
			val = node->locked;	/* preserve secondary queue */

			/*
			 * We have a local waiter, either a real or a fake one;
			 * reload @next in case it was changed by
			 * cna_order_queue().
			 */
			next = node->next;

			/*
			 * Pass over NUMA node id of primary queue, to maintain
			 * the preference even if the next waiter is on a
			 * different node.
			 */
			((struct cna_node *)next)->numa_node = cn->numa_node;

			((struct cna_node *)next)->start_time = cn->start_time;
		}
	} else {
		/*
		 * We decided to flush the secondary queue;
		 * this can only happen if that queue is not empty.
		 */
		WARN_ON(node->locked <= 1);
		/*
		 * Splice the secondary queue onto the primary queue and pass
		 * the lock to the longest waiting remote waiter.
		 */
		next = cna_splice_head(NULL, 0, node, next);
	}

	arch_mcs_lock_handoff(&next->locked, val);
}

#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define cna_queued_spin_lock_slowpath	pv_ops.lock.queued_spin_lock_slowpath
#else
void (*cna_queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val) =
	native_queued_spin_lock_slowpath;
EXPORT_SYMBOL(cna_queued_spin_lock_slowpath);
#endif

/*
 * Constant (boot-param configurable) flag selecting the NUMA-aware variant
 * of spinlock. Possible values: -1 (off, default) / 0 (auto) / 1 (on).
 */
static int numa_spinlock_flag = -1;

static int __init numa_spinlock_setup(char *str)
{
	if (!strcmp(str, "auto")) {
		numa_spinlock_flag = 0;
		return 1;
	} else if (!strcmp(str, "on")) {
		numa_spinlock_flag = 1;
		return 1;
	} else if (!strcmp(str, "off")) {
		numa_spinlock_flag = -1;
		return 1;
	}

	return 0;
}
__setup("numa_spinlock=", numa_spinlock_setup);

void __cna_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);

/*
 * Switch to the NUMA-friendly slow path for spinlocks when we have
 * multiple NUMA nodes in a native environment, unless the user has
 * overridden this default behavior by setting the numa_spinlock flag.
 */
void __init cna_configure_spin_lock_slowpath(void)
{
	if (numa_spinlock_flag < 0)
		return;

	if (numa_spinlock_flag == 0 &&
	    (nr_node_ids < 2 ||
	     cna_queued_spin_lock_slowpath != native_queued_spin_lock_slowpath))
		return;

	cna_init_nodes();

	cna_queued_spin_lock_slowpath = __cna_queued_spin_lock_slowpath;

	pr_info("Enabling CNA spinlock\n");
}
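/*
 * Example configuration, based on the handlers above:
 *
 *   numa_spinlock=auto  - boot option; use CNA when running natively on a
 *                         machine with more than one NUMA node
 *   numa_spinlock=on    - boot option; use CNA unconditionally
 *   numa_spinlock=off   - boot option; keep the default MCS slow path
 *
 * The intra-node handoff threshold is exposed as the writable module
 * parameter numa_spinlock_threshold_ns (1000000 ns, i.e. 1ms, by default).
 */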