Linux Interrupt

Posted by Peng Weilin on 2016-09-02

In interviews we are often asked: what are the differences between the interrupt bottom-half mechanisms softirq, tasklet, and workqueue, and why did Linux design several of them? Few people can answer this really clearly, so below we analyse the differences in detail.

The code analysis in this article is based on Linux kernel 3.18.22 and the arm64 architecture. The best way to learn is still to "RTFSC" (read the source code).

1. Linux Interrupts

On arm64, as on every other CPU architecture, the interrupt handling flow is the same: normal execution is interrupted, the CPU enters the interrupt service routine, and the routine saves the context, handles the interrupt, and restores the context:

(Figure: interrupt handling. Source: ARMPG)

During the whole interrupt handling process, the arm64 CPU's local interrupts are automatically disabled (the interrupt bits in the PSTATE register are masked). If you want nested interrupts, you have to re-enable interrupts yourself inside the interrupt service routine. Linux currently does not use interrupt nesting.

(Figure: nested interrupt handling. Source: ARMPG)

1.1 Enabling / Disabling CPU Interrupts

On arm64, interrupts on the local CPU are disabled and enabled by manipulating the IRQ mask bit (the I bit of the DAIF field) in PSTATE; the same layout appears in the SPSR (Saved Program Status Register), which snapshots PSTATE on exception entry.

(Figure: arm64 SPSR layout. Source: ARMPG)

The Linux implementation of the functions that disable and enable local CPU interrupts on arm64:

  • arch/arm64/include/asm/irqflags.h:
  • local_irq_disable() -> raw_local_irq_disable() -> arch_local_irq_disable()
  • local_irq_enable() -> raw_local_irq_enable() -> arch_local_irq_enable()

static inline void arch_local_irq_enable(void)
{
asm volatile(
// (1) Clear the I flag in DAIF to enable IRQs
"msr daifclr, #2 // arch_local_irq_enable"
:
:
: "memory");
}

static inline void arch_local_irq_disable(void)
{
asm volatile(
// (2) Set the I flag in DAIF to disable IRQs
"msr daifset, #2 // arch_local_irq_disable"
:
:
: "memory");
}

static inline unsigned long arch_local_irq_save(void)
{
unsigned long flags;
asm volatile(
// (3) Save the current DAIF flags, then mask IRQs
"mrs %0, daif // arch_local_irq_save\n"
"msr daifset, #2"
: "=r" (flags)
:
: "memory");
return flags;
}

static inline unsigned long arch_local_save_flags(void)
{
unsigned long flags;
asm volatile(
// (4) Read (save) the current DAIF flags without changing them
"mrs %0, daif // arch_local_save_flags"
: "=r" (flags)
:
: "memory");
return flags;
}
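
For context, here is a minimal sketch of how driver code typically reaches these primitives through the generic local_irq_save()/local_irq_restore() wrappers; the counter and helper function are hypothetical:

#include <linux/irqflags.h>

static unsigned long event_count;        /* hypothetical data also touched by an ISR */

static void count_event(void)
{
	unsigned long flags;

	local_irq_save(flags);           /* ends up in arch_local_irq_save() on arm64 */
	event_count++;                   /* critical section: local IRQs are masked */
	local_irq_restore(flags);        /* writes the saved DAIF flags back */
}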

1.2 The Interrupt Controller (GIC)

The previous section covered how the CPU masks its own interrupts, but another job still has to be done: delivering the various interrupts (external, internal, inter-processor, and so on) to multiple CPUs according to priority, affinity, privacy, etc. This is the job of the interrupt controller, the GIC (Generic Interrupt Controller).

(Figure: GIC-400 block diagram. Source: GICANALY)

From a software point of view, the GIC consists of two functional blocks (ARMPG):

  • Distributor. Connects to all interrupt sources in the system; through its registers each interrupt's attributes can be configured independently: priority, state, security, routing information, enable status. It determines which interrupts are forwarded to a CPU core.
  • CPU Interface. Used by a CPU core to receive interrupts; its registers mainly let the core mask, identify, and control the states of the interrupts forwarded to it. Each CPU core has its own CPU interface.

From the GIC's perspective, interrupts fall into the following types (ARMPG):

  • SGI (Software Generated Interrupt), interrupt IDs 0-15. Typically used by the system to implement IPIs.
  • PPI (Private Peripheral Interrupt), interrupt IDs 16-31. Private interrupts; each CPU has its own independent copy, e.g. the per-core timer interrupt.
  • SPI (Shared Peripheral Interrupt), interrupt IDs 32-1020. The most common peripheral interrupts; an SPI can be delivered to one or more CPUs.
  • LPI (Locality-specific Peripheral Interrupt). Message-based interrupts, not supported by GICv1 or GICv2.

The GIC is not hard to understand in principle, but once technical details such as cascading come into play, the whole initialization process gets fairly complex. You can download the GIC manuals (GIC-400, GIC-500) to study, and "GIC 代码分析" (GIC code analysis) is also a very good write-up.

All the operations for a particular GIC are collected in an irq_chip data structure. Taking the GIC-400 as an example, its operations are:

  • drivers/irqchip/irq-gic.c:
static struct irq_chip gic_chip = {
.name = "GIC",
.irq_mask = gic_mask_irq,
.irq_unmask = gic_unmask_irq,
.irq_eoi = gic_eoi_irq,
.irq_set_type = gic_set_type,
.irq_retrigger = gic_retrigger,
#ifdef CONFIG_SMP
.irq_set_affinity = gic_set_affinity,
#endif
.irq_set_wake = gic_set_wake,
};

1.3 The Linux Interrupt Handling Flow

From the code, the Linux interrupt handling flow looks roughly like this:

(Figure: basic Linux interrupt handling flow)

For every GIC interrupt source, Linux allocates a corresponding irq_desc data structure. The irq_desc holds two interrupt handling functions, desc->handle_irq() and desc->action->handler(), which represent the two levels of interrupt handling:

  • desc->handle_irq(). The first-level handler, assigned by the system at initialization time according to the characteristics of the interrupt source. Different types of interrupt source require different GIC operations; factoring these common GIC operations out gives the first-level handlers. Concrete implementations include:

    • handle_fasteoi_irq()
    • handle_simple_irq()
    • handle_edge_irq()
    • handle_level_irq()
    • handle_percpu_irq()
    • handle_percpu_devid_irq()
  • desc->action->handler(). The second-level handler, registered by the user; it implements the device-specific driver service routine and contains no GIC-specific code. Since one interrupt line can be shared by several devices, one desc can have multiple actions attached, organized as a linked list.

(Figure: Linux interrupt handling levels)

1.4 Registering an Interrupt Handler

As the two-level structure in the previous section shows, the second-level handler desc->action->handler is registered by the user. Let's analyse the registration process:

  • kernel/irq/manage.c:
  • request_irq() -> request_threaded_irq() -> __setup_irq()
static inline int __must_check
request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
const char *name, void *dev)
{
return request_threaded_irq(irq, handler, NULL, flags, name, dev);
}
| →
int request_threaded_irq(unsigned int irq, irq_handler_t handler,
irq_handler_t thread_fn, unsigned long irqflags,
const char *devname, void *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
int retval;

/*
* Sanity-check: shared interrupts must pass in a real dev-ID,
* otherwise we'll have trouble later trying to figure out
* which interrupt is which (messes up the interrupt freeing
* logic etc).
*/
if ((irqflags & IRQF_SHARED) && !dev_id)
return -EINVAL;

// (1) Find the desc structure for this irq number
desc = irq_to_desc(irq);
if (!desc)
return -EINVAL;

if (!irq_settings_can_request(desc) ||
WARN_ON(irq_settings_is_per_cpu_devid(desc)))
return -EINVAL;

// (2) If handler is NULL, the caller wants a purely threaded interrupt,
// so action->handler is initialized to irq_default_primary_handler(),
// which simply returns IRQ_WAKE_THREAD
if (!handler) {
if (!thread_fn)
return -EINVAL;
handler = irq_default_primary_handler;
}

// (3) Allocate a new action structure
action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
if (!action)
return -ENOMEM;

action->handler = handler;
action->thread_fn = thread_fn;
action->flags = irqflags;
action->name = devname;
action->dev_id = dev_id;

chip_bus_lock(desc);
// (4) Install the new action into the desc
retval = __setup_irq(irq, desc, action);
chip_bus_sync_unlock(desc);

if (retval)
kfree(action);

#ifdef CONFIG_DEBUG_SHIRQ_FIXME
if (!retval && (irqflags & IRQF_SHARED)) {
/*
* It's a shared IRQ -- the driver ought to be prepared for it
* to happen immediately, so let's make sure....
* We disable the irq to make sure that a 'real' IRQ doesn't
* run in parallel with our fake.
*/
unsigned long flags;

disable_irq(irq);
local_irq_save(flags);

handler(irq, dev_id);

local_irq_restore(flags);
enable_irq(irq);
}
#endif
return retval;
}
|| →
static int
__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
{
struct irqaction *old, **old_ptr;
unsigned long flags, thread_mask = 0;
int ret, nested, shared = 0;
cpumask_var_t mask;

if (!desc)
return -EINVAL;

if (desc->irq_data.chip == &no_irq_chip)
return -ENOSYS;
if (!try_module_get(desc->owner))
return -ENODEV;

/*
* Check whether the interrupt nests into another interrupt
* thread.
*/
nested = irq_settings_is_nested_thread(desc);
// (4.1) Check whether the interrupt nests into another interrupt thread
if (nested) {
if (!new->thread_fn) {
ret = -EINVAL;
goto out_mput;
}
/*
* Replace the primary handler which was provided from
* the driver for non nested interrupt handling by the
* dummy function which warns when called.
*/
new->handler = irq_nested_primary_handler;
} else {
// (4.2) Check whether the interrupt can be threaded.
// If _IRQ_NOTHREAD is not set and forced threading is enabled (force_irqthreads=1),
// force the interrupt to be threaded:
// new->thread_fn = new->handler; new->handler = irq_default_primary_handler;
if (irq_settings_can_thread(desc))
irq_setup_forced_threading(new);
}

/*
* Create a handler thread when a thread function is supplied
* and the interrupt does not nest into another interrupt
* thread.
*/
// (4.3) For a threaded interrupt, create the corresponding kernel thread
if (new->thread_fn && !nested) {
struct task_struct *t;
static const struct sched_param param = {
.sched_priority = MAX_USER_RT_PRIO/2,
};

// Create the thread
t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
new->name);
if (IS_ERR(t)) {
ret = PTR_ERR(t);
goto out_mput;
}

sched_setscheduler_nocheck(t, SCHED_FIFO, &param);

/*
* We keep the reference to the task struct even if
* the thread dies to avoid that the interrupt code
* references an already freed task_struct.
*/
get_task_struct(t);
// Store it in the ->thread member
new->thread = t;
/*
* Tell the thread to set its affinity. This is
* important for shared interrupt handlers as we do
* not invoke setup_affinity() for the secondary
* handlers as everything is already set up. Even for
* interrupts marked with IRQF_NO_BALANCE this is
* correct as we want the thread to move to the cpu(s)
* on which the requesting code placed the interrupt.
*/
set_bit(IRQTF_AFFINITY, &new->thread_flags);
}

if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
ret = -ENOMEM;
goto out_thread;
}

/*
* Drivers are often written to work w/o knowledge about the
* underlying irq chip implementation, so a request for a
* threaded irq without a primary hard irq context handler
* requires the ONESHOT flag to be set. Some irq chips like
* MSI based interrupts are per se one shot safe. Check the
* chip flags, so we can avoid the unmask dance at the end of
* the threaded handler for those.
*/
if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
new->flags &= ~IRQF_ONESHOT;

/*
* The following block of code has to be executed atomically
*/
// (4.4) Walk to the last action structure in the list
raw_spin_lock_irqsave(&desc->lock, flags);
old_ptr = &desc->action;
old = *old_ptr;
if (old) {
/*
* Can't share interrupts unless both agree to and are
* the same type (level, edge, polarity). So both flag
* fields must have IRQF_SHARED set and the bits which
* set the trigger type must match. Also all must
* agree on ONESHOT.
*/
if (!((old->flags & new->flags) & IRQF_SHARED) ||
((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
((old->flags ^ new->flags) & IRQF_ONESHOT))
goto mismatch;

/* All handlers must agree on per-cpuness */
if ((old->flags & IRQF_PERCPU) !=
(new->flags & IRQF_PERCPU))
goto mismatch;

/* add new interrupt at end of irq queue */
do {
/*
* Or all existing action->thread_mask bits,
* so we can find the next zero bit for this
* new action.
*/
thread_mask |= old->thread_mask;
old_ptr = &old->next;
old = *old_ptr;
} while (old);
// With more than one action, mark the line as shared
shared = 1;
}

/*
* Setup the thread mask for this irqaction for ONESHOT. For
* !ONESHOT irqs the thread mask is 0 so we can avoid a
* conditional in irq_wake_thread().
*/
if (new->flags & IRQF_ONESHOT) {
/*
* Unlikely to have 32 resp 64 irqs sharing one line,
* but who knows.
*/
if (thread_mask == ~0UL) {
ret = -EBUSY;
goto out_mask;
}
/*
* The thread_mask for the action is or'ed to
* desc->thread_active to indicate that the
* IRQF_ONESHOT thread handler has been woken, but not
* yet finished. The bit is cleared when a thread
* completes. When all threads of a shared interrupt
* line have completed desc->threads_active becomes
* zero and the interrupt line is unmasked. See
* handle.c:irq_wake_thread() for further information.
*
* If no thread is woken by primary (hard irq context)
* interrupt handlers, then desc->threads_active is
* also checked for zero to unmask the irq line in the
* affected hard irq flow handlers
* (handle_[fasteoi|level]_irq).
*
* The new action gets the first zero bit of
* thread_mask assigned. See the loop above which or's
* all existing action->thread_mask bits.
*/
new->thread_mask = 1 << ffz(thread_mask);

} else if (new->handler == irq_default_primary_handler &&
!(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
/*
* The interrupt was requested with handler = NULL, so
* we use the default primary handler for it. But it
* does not have the oneshot flag set. In combination
* with level interrupts this is deadly, because the
* default primary handler just wakes the thread, then
* the irq lines is reenabled, but the device still
* has the level irq asserted. Rinse and repeat....
*
* While this works for edge type interrupts, we play
* it safe and reject unconditionally because we can't
* say for sure which type this interrupt really
* has. The type flags are unreliable as the
* underlying chip implementation can override them.
*/
pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
irq);
ret = -EINVAL;
goto out_mask;
}

// (4.5) If this is the first action, do some one-time initialization
if (!shared) {
ret = irq_request_resources(desc);
if (ret) {
pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
new->name, irq, desc->irq_data.chip->name);
goto out_mask;
}

init_waitqueue_head(&desc->wait_for_threads);

/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
ret = __irq_set_trigger(desc, irq,
new->flags & IRQF_TRIGGER_MASK);

if (ret)
goto out_mask;
}

desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
IRQS_ONESHOT | IRQS_WAITING);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);

if (new->flags & IRQF_PERCPU) {
irqd_set(&desc->irq_data, IRQD_PER_CPU);
irq_settings_set_per_cpu(desc);
}

if (new->flags & IRQF_ONESHOT)
desc->istate |= IRQS_ONESHOT;

if (irq_settings_can_autoenable(desc))
irq_startup(desc, true);
else
/* Undo nested disables: */
desc->depth = 1;

/* Exclude IRQ from balancing if requested */
if (new->flags & IRQF_NOBALANCING) {
irq_settings_set_no_balancing(desc);
irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
}

// Set the interrupt affinity
/* Set default affinity mask once everything is setup */
setup_affinity(irq, desc, mask);
} else if (new->flags & IRQF_TRIGGER_MASK) {
unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
unsigned int omsk = irq_settings_get_trigger_mask(desc);

if (nmsk != omsk)
/* hope the handler works with current trigger mode */
pr_warning("irq %d uses trigger mode %u; requested %u\n",
irq, nmsk, omsk);
}

// (4.6) Link the new action into the desc list
new->irq = irq;
*old_ptr = new;

irq_pm_install_action(desc, new);

/* Reset broken irq detection when installing new handler */
desc->irq_count = 0;
desc->irqs_unhandled = 0;

/*
* Check whether we disabled the irq via the spurious handler
* before. Reenable it and give it another chance.
*/
// (4.7) If the irq was previously disabled as spurious, re-enable it
if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
desc->istate &= ~IRQS_SPURIOUS_DISABLED;
__enable_irq(desc, irq);
}

raw_spin_unlock_irqrestore(&desc->lock, flags);

/*
* Strictly no need to wake it up, but hung_task complains
* when no hard interrupt wakes the thread up.
*/
// (4.8) Wake up the thread of a threaded interrupt
if (new->thread)
wake_up_process(new->thread);

register_irq_proc(irq, desc);
new->dir = NULL;
register_handler_proc(irq, new);
free_cpumask_var(mask);

return 0;

mismatch:
if (!(new->flags & IRQF_PROBE_SHARED)) {
pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
irq, new->flags, new->name, old->flags, old->name);
#ifdef CONFIG_DEBUG_SHIRQ
dump_stack();
#endif
}
ret = -EBUSY;

out_mask:
raw_spin_unlock_irqrestore(&desc->lock, flags);
free_cpumask_var(mask);

out_thread:
if (new->thread) {
struct task_struct *t = new->thread;

new->thread = NULL;
kthread_stop(t);
put_task_struct(t);
}
out_mput:
module_put(desc->owner);
return ret;
}
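
To make the registration path concrete, here is a minimal sketch of a traditional (non-threaded) handler registered with request_irq(); the device structure, names and irq number are hypothetical:

#include <linux/interrupt.h>

struct foo_dev {                           /* hypothetical device context */
	int irq;
};

static irqreturn_t foo_isr(int irq, void *dev_id)
{
	struct foo_dev *foo = dev_id;

	/* acknowledge the hardware and do only the urgent work here */
	(void)foo;
	return IRQ_HANDLED;                /* this is desc->action->handler() */
}

static int foo_request(struct foo_dev *foo)
{
	/* IRQF_SHARED requires a unique dev_id, here the foo pointer */
	return request_irq(foo->irq, foo_isr, IRQF_SHARED, "foo", foo);
}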

1.5 Threaded Interrupts

As the previous section shows, request_irq() registers a traditional interrupt, while calling request_threaded_irq() directly registers a threaded interrupt. The main goal of threaded interrupts is to move work out of interrupt context into a thread, reducing the time the system spends with interrupts disabled and improving real-time behaviour.

The naming rule for an interrupt's thread is:

t = kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name);

We can list the interrupt threads with the ps command; note that they are real-time SCHED_FIFO threads:

root@:/ # ps | grep "irq/"
root 171 2 0 0 irq_thread 0000000000 S irq/389-charger
root 239 2 0 0 irq_thread 0000000000 S irq/296-PS_int-
root 247 2 0 0 irq_thread 0000000000 S irq/297-1124000
root 1415 2 0 0 irq_thread 0000000000 S irq/293-goodix_
root@a0255:/ #

The creation and processing flow of a threaded interrupt looks like this:

(Figure: Linux threaded interrupts)

Threads and actions are in one-to-one correspondence: each interrupt handler registered by the user gets its own interrupt thread.
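
As a minimal sketch, a threaded registration with both a primary and a threaded handler might look like this (the device, names and handlers are hypothetical):

#include <linux/interrupt.h>

/* Primary handler: hard-irq context, only quick, non-sleeping work. */
static irqreturn_t bar_hardirq(int irq, void *dev_id)
{
	return IRQ_WAKE_THREAD;            /* ask the core to wake bar_thread_fn() */
}

/* Threaded handler: runs in the SCHED_FIFO "irq/<nr>-bar" kernel thread. */
static irqreturn_t bar_thread_fn(int irq, void *dev_id)
{
	/* sleepable work (bus transactions, mutexes, ...) is allowed here */
	return IRQ_HANDLED;
}

static int bar_request(int irq, void *dev)
{
	/* With handler == NULL the core would substitute irq_default_primary_handler()
	 * and IRQF_ONESHOT would be mandatory; here both handlers are supplied. */
	return request_threaded_irq(irq, bar_hardirq, bar_thread_fn,
				    IRQF_ONESHOT, "bar", dev);
}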

1.6 Enabling / Disabling a Peripheral Interrupt

The earlier sections covered enabling/disabling all interrupts on the local CPU. To enable/disable a single interrupt source, use enable_irq()/disable_irq(). These end up calling the GIC chip's functions:

  • kernel/irq/manage.c:
  • enable_irq() -> __enable_irq() -> irq_enable()
void enable_irq(unsigned int irq)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);

if (!desc)
return;
if (WARN(!desc->irq_data.chip,
KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
goto out;

__enable_irq(desc, irq);
out:
irq_put_desc_busunlock(desc, flags);
}
| →
void __enable_irq(struct irq_desc *desc, unsigned int irq)
{
switch (desc->depth) {
case 0:
err_out:
WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
break;
case 1: {
if (desc->istate & IRQS_SUSPENDED)
goto err_out;
/* Prevent probing on this irq: */
irq_settings_set_noprobe(desc);
irq_enable(desc);
check_irq_resend(desc, irq);
/* fall-through */
}
default:
desc->depth--;
}
}
|| →
void irq_enable(struct irq_desc *desc)
{
// Call into the GIC chip operations
irq_state_clr_disabled(desc);
if (desc->irq_data.chip->irq_enable)
desc->irq_data.chip->irq_enable(&desc->irq_data);
else
desc->irq_data.chip->irq_unmask(&desc->irq_data);
irq_state_clr_masked(desc);
}
  • kernel/irq/manage.c:
  • disable_irq() -> __disable_irq_nosync() -> __disable_irq() -> irq_disable()
void disable_irq(unsigned int irq)
{
if (!__disable_irq_nosync(irq))
synchronize_irq(irq);
}
| →
static int __disable_irq_nosync(unsigned int irq)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);

if (!desc)
return -EINVAL;
__disable_irq(desc, irq);
irq_put_desc_busunlock(desc, flags);
return 0;
}
|| →
void __disable_irq(struct irq_desc *desc, unsigned int irq)
{
if (!desc->depth++)
irq_disable(desc);
}
||| →
void irq_disable(struct irq_desc *desc)
{
// Call into the GIC chip operations
irq_state_set_disabled(desc);
if (desc->irq_data.chip->irq_disable) {
desc->irq_data.chip->irq_disable(&desc->irq_data);
irq_state_set_masked(desc);
}
}

| →
void synchronize_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);

if (desc) {
__synchronize_hardirq(desc);
/*
* We made sure that no hardirq handler is
* running. Now verify that no threaded handlers are
* active.
*/
// For threaded interrupts, also wait until the threaded handlers have finished
wait_event(desc->wait_for_threads,
!atomic_read(&desc->threads_active));
}
}
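
A small sketch of how a driver might use this pair, for example around a suspend/resume path (the helpers are hypothetical):

#include <linux/interrupt.h>

static void foo_suspend_irq(int irq)
{
	/* disable_irq() also waits for running handlers and irq threads to finish */
	disable_irq(irq);
}

static void foo_resume_irq(int irq)
{
	/* must balance the disable above: desc->depth is a nesting counter */
	enable_irq(irq);
}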

1.7 Interrupt Affinity

Also building on what the GIC chip provides, we can configure an interrupt source's affinity to CPUs.

  • kernel/irq/manage.c:
  • irq_set_affinity() -> __irq_set_affinity() -> irq_set_affinity_locked() -> irq_do_set_affinity()
static inline int
irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
{
return __irq_set_affinity(irq, cpumask, false);
}
| →
int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
int ret;

if (!desc)
return -EINVAL;

raw_spin_lock_irqsave(&desc->lock, flags);
ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
raw_spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
|| →
int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
bool force)
{
struct irq_chip *chip = irq_data_get_irq_chip(data);
struct irq_desc *desc = irq_data_to_desc(data);
int ret = 0;

if (!chip || !chip->irq_set_affinity)
return -EINVAL;

if (irq_can_move_pcntxt(data)) {
ret = irq_do_set_affinity(data, mask, force);
} else {
irqd_set_move_pending(data);
irq_copy_pending(desc, mask);
}

if (desc->affinity_notify) {
kref_get(&desc->affinity_notify->kref);
schedule_work(&desc->affinity_notify->work);
}
irqd_set(data, IRQD_AFFINITY_SET);

return ret;
}
||| →
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
{
struct irq_desc *desc = irq_data_to_desc(data);
struct irq_chip *chip = irq_data_get_irq_chip(data);
int ret;

// Call into the GIC chip operations
ret = chip->irq_set_affinity(data, mask, force);
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
#ifdef CONFIG_MTK_IRQ_NEW_DESIGN
update_affinity_settings(desc, mask, true);
#else
cpumask_copy(data->affinity, mask);
#endif
case IRQ_SET_MASK_OK_NOCOPY:
irq_set_thread_affinity(desc);
ret = 0;
}

return ret;
}
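
As a sketch, pinning a (hypothetical) interrupt to CPU 2 from kernel code could look like this; from user space the same effect is usually achieved through /proc/irq/<nr>/smp_affinity:

#include <linux/interrupt.h>
#include <linux/cpumask.h>

static int pin_irq_to_cpu2(unsigned int irq)
{
	/* eventually reaches chip->irq_set_affinity(), i.e. gic_set_affinity() */
	return irq_set_affinity(irq, cpumask_of(2));
}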

2. Linux Interrupt Bottom Halves

Now for the famous interrupt bottom halves: softirq, tasklet and workqueue. Their main purpose is to reduce the time the system spends with interrupts disabled: only the critical code runs in the interrupt itself, and the bulk of the processing is moved to a context where interrupts do not have to be disabled.

Interrupt threading, described above, is the most aggressive approach, but most of the time the classic top-half/bottom-half split is still what is used.

Workqueues are analysed in detail in another article; here we only look at softirq and tasklet.

2.1 preempt_count

static __always_inline int preempt_count(void)
{
return current_thread_info()->preempt_count; /* 0 => preemptable, <0 => bug */
}

Before starting, some background on preempt_count: it is a field in the thread_info structure that indicates whether the current process may be preempted.

Preemption means that a process running in kernel space, even if it does not voluntarily give up the CPU, can be forcibly deprived of the CPU when its time slice runs out or a higher-priority task becomes runnable.

But a process may be in the middle of a critical operation where being preempted would break the system. Linux therefore added the preempt_count field: 0 means the task may be preempted, greater than 0 means it may not.
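
A minimal sketch of how kernel code raises and lowers preempt_count around a critical section, here protecting a hypothetical per-CPU counter:

#include <linux/preempt.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, foo_stat);   /* hypothetical per-CPU data */

static void foo_account(void)
{
	preempt_disable();            /* preempt_count++: no preemption from here on */
	__this_cpu_inc(foo_stat);     /* safe: the task cannot migrate to another CPU */
	preempt_enable();             /* preempt_count--, may reschedule if needed */
}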

When returning from an interrupt to kernel mode, the kernel checks whether the current task should be preempted:

  • arch/arm64/kernel/entry.S:
  • el1_irq -> el1_preempt -> preempt_schedule_irq()
	.align	6
el1_irq:
kernel_entry 1
enable_dbg
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_off
#endif
#ifdef CONFIG_MTPROF
bl MT_trace_hardirqs_off
#endif
irq_handler

#ifdef CONFIG_PREEMPT
get_thread_info tsk
ldr w24, [tsk, #TI_PREEMPT] // get preempt count
// (1) If preempt_count != 0, skip the preemption check
cbnz w24, 1f // preempt count != 0
ldr x0, [tsk, #TI_FLAGS] // get flags
// (2) If preempt_count == 0 and TIF_NEED_RESCHED is set,
// go and reschedule
tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
bl el1_preempt
1:
#endif
#ifdef CONFIG_MTPROF
bl MT_trace_hardirqs_on
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_on
#endif
kernel_exit 1
ENDPROC(el1_irq)

#ifdef CONFIG_PREEMPT
el1_preempt:
mov x24, lr
// (3) Preemptive scheduling
1: bl preempt_schedule_irq // irq en/disable is done inside
ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS
tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling?
ret x24
#endif

| →

asmlinkage __visible void __sched preempt_schedule_irq(void)
{
enum ctx_state prev_state;

/* Catch callers which need to be fixed */
BUG_ON(preempt_count() || !irqs_disabled());

prev_state = exception_enter();

do {
__preempt_count_add(PREEMPT_ACTIVE);
local_irq_enable();
__schedule();
local_irq_disable();
__preempt_count_sub(PREEMPT_ACTIVE);

/*
* Check again in case we missed a preemption opportunity
* between schedule and now.
*/
barrier();
} while (need_resched());

exception_exit(prev_state);
}

Although any preempt_count > 0 disables preemption, Linux further partitions the preempt_count bits according to the different contexts:

reserved bits | bit 21         | bit 20 | bits 19-16 | bits 15-8 | bits 7-0
(reserved)    | PREEMPT_ACTIVE | NMI    | HARDIRQ    | SOFTIRQ   | PREEMPT
/*
* PREEMPT_MASK: 0x000000ff
* SOFTIRQ_MASK: 0x0000ff00
* HARDIRQ_MASK: 0x000f0000
* NMI_MASK: 0x00100000
* PREEMPT_ACTIVE: 0x00200000
*/
#define PREEMPT_BITS 8
#define SOFTIRQ_BITS 8
#define HARDIRQ_BITS 4
#define NMI_BITS 1

Each context uses its own bits to disable/enable preemption:

  • Normal context (PREEMPT_MASK). Functions: preempt_disable() / preempt_enable()
  • Softirq context (SOFTIRQ_MASK). Functions: local_bh_disable() / local_bh_enable()
  • Hard interrupt context (HARDIRQ_MASK). Functions: __irq_enter() / __irq_exit()
  • NMI context (NMI_MASK). Functions: nmi_enter() / nmi_exit()

Conversely, we can also use the value of preempt_count to tell which context we are currently in:

#define in_irq()		(hardirq_count())
#define in_softirq() (softirq_count())
#define in_interrupt() (irq_count())
#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
#define in_nmi() (preempt_count() & NMI_MASK)

#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
| NMI_MASK))
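
These macros let common code adapt to the context it is called from. A typical (sketched) use is picking the GFP flags for an allocation; the helper is hypothetical:

#include <linux/hardirq.h>
#include <linux/slab.h>

static void *foo_alloc(size_t size)
{
	/* in_interrupt() is non-zero in hardirq, softirq and NMI context,
	 * where sleeping is forbidden, so fall back to GFP_ATOMIC there. */
	gfp_t gfp = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;

	return kmalloc(size, gfp);
}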

2.2 softirq

Back to the top-half/bottom-half architecture: although Linux moves most of the work out of interrupt context so that interrupts stay enabled, it still wants the deferred work to run quickly. To guarantee that, softirq processing disables preemption via __local_bh_disable_ip().

The softirq mechanism is implemented as follows:

  • The system supports a fixed set of softirqs; the softirq_vec array records their handler functions:
enum
{
HI_SOFTIRQ=0,
TIMER_SOFTIRQ,
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
BLOCK_SOFTIRQ,
BLOCK_IOPOLL_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
HRTIMER_SOFTIRQ,
RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */

NR_SOFTIRQS
};

// Register the handler for a softirq
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
softirq_vec[nr].action = action;
}

// The TASKLET_SOFTIRQ and HI_SOFTIRQ softirqs are used to service tasklets.
open_softirq(TASKLET_SOFTIRQ, tasklet_action);
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
  • irq_stat[cpu].__softirq_pending records the pending state of all softirqs on each CPU; raise_softirq() marks one softirq pending:
void raise_softirq(unsigned int nr)
{
unsigned long flags;

local_irq_save(flags);
raise_softirq_irqoff(nr);
local_irq_restore(flags);
}
| →
inline void raise_softirq_irqoff(unsigned int nr)
{
__raise_softirq_irqoff(nr);

if (!in_interrupt())
wakeup_softirqd();
}
||
void __raise_softirq_irqoff(unsigned int nr)
{
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr);
}
||| →
#define or_softirq_pending(x) (local_softirq_pending() |= (x))

#ifndef __ARCH_IRQ_STAT
extern irq_cpustat_t irq_stat[]; /* defined in asm/hardirq.h */
#define __IRQ_STAT(cpu, member) (irq_stat[cpu].member)
#endif

/* arch independent irq_stat fields */
#define local_softirq_pending() \
__IRQ_STAT(smp_processor_id(), __softirq_pending)
  • Softirqs are executed at two points: when leaving an interrupt in irq_exit(), or in the ksoftirqd thread:

(Figure: Linux softirq flow)

The softirq code uses smpboot_register_percpu_thread() to create a ksoftirqd thread for each CPU:

root@:/ # ps | grep softirq
root 3 2 0 0 smpboot_th 0000000000 S ksoftirqd/0
root 12 2 0 0 __kthread_ 0000000000 R ksoftirqd/1
root 16 2 0 0 __kthread_ 0000000000 R ksoftirqd/2
root 20 2 0 0 __kthread_ 0000000000 R ksoftirqd/3
root 24 2 0 0 __kthread_ 0000000000 R ksoftirqd/4
root 28 2 0 0 __kthread_ 0000000000 R ksoftirqd/5
root 32 2 0 0 __kthread_ 0000000000 R ksoftirqd/6
root 36 2 0 0 __kthread_ 0000000000 R ksoftirqd/7

Softirqs are preferentially executed in irq_exit(); if limits such as the time budget are exceeded, execution is handed over to the ksoftirqd thread. A softirq is executed in ksoftirqd if any of the following conditions is met:

  • It has been running in irq_exit()->__do_softirq() for more than 2 ms.
  • irq_exit()->__do_softirq() has looped over the pending softirqs more than 10 times.
  • While running in irq_exit()->__do_softirq(), the current thread needs to be rescheduled.
  • raise_softirq() is called to raise a softirq while not in interrupt context.

Note also that softirq processing calls the handlers in the softirq_vec[] array one by one in priority order, so an earlier softirq can delay the ones after it. Keep this in mind when writing code.
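
New softirqs are reserved for core subsystems, so the following is an illustration of the open_softirq()/raise_softirq_irqoff() API only, modeled loosely on how the block layer uses its BLOCK_SOFTIRQ slot; the handler names are hypothetical, and in real driver code you would normally use a tasklet instead (next section):

#include <linux/interrupt.h>

/* Hypothetical softirq handler, wired to an existing softirq slot at boot. */
static void foo_done_softirq(struct softirq_action *h)
{
	/* drain a per-CPU completion list here; runs with IRQs enabled */
}

static void foo_softirq_setup(void)
{
	open_softirq(BLOCK_SOFTIRQ, foo_done_softirq);   /* install the handler */
}

/* Called from a hard-irq handler once the urgent work is done. */
static void foo_complete_from_irq(void)
{
	raise_softirq_irqoff(BLOCK_SOFTIRQ);   /* mark pending; runs at irq_exit() */
}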

2.3 tasklet

Linux already has the softirq mechanism, so why does it also need tasklets? The main reason is that the same softirq can run on several CPUs at once, which raises many reentrancy problems, whereas a given tasklet runs on only one CPU at a time, so it does not have to handle that mutual exclusion itself. In addition, Linux discourages users from adding new softirqs.

Let's analyse how the tasklet mechanism is implemented:

  • The per-cpu variables tasklet_vec/tasklet_hi_vec record, as linked lists, the tasklets that the current CPU has to process:
void __init softirq_init(void)
{
int cpu;

for_each_possible_cpu(cpu) {
// (1) tasklet_vec is the normal-priority tasklet list
per_cpu(tasklet_vec, cpu).tail =
&per_cpu(tasklet_vec, cpu).head;
// (2) tasklet_hi_vec is the high-priority tasklet list
per_cpu(tasklet_hi_vec, cpu).tail =
&per_cpu(tasklet_hi_vec, cpu).head;
}
}
  • Queueing a tasklet:
static inline void tasklet_schedule(struct tasklet_struct *t)
{
if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
__tasklet_schedule(t);
}
| →
void __tasklet_schedule(struct tasklet_struct *t)
{
unsigned long flags;

local_irq_save(flags);
// (1) Append the new tasklet to the tail of this CPU's list
t->next = NULL;
*__this_cpu_read(tasklet_vec.tail) = t;
__this_cpu_write(tasklet_vec.tail, &(t->next));
// (2) Raise the softirq that processes tasklets
raise_softirq_irqoff(TASKLET_SOFTIRQ);
local_irq_restore(flags);
}
  • Processing tasklets:
static void tasklet_action(struct softirq_action *a)
{
struct tasklet_struct *list;

local_irq_disable();
// (1) Move all tasklets currently on the list into a local list
list = __this_cpu_read(tasklet_vec.head);
// (2) Reset tasklet_vec.head/tail to the empty state so new tasklets can keep arriving
__this_cpu_write(tasklet_vec.head, NULL);
__this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
local_irq_enable();

// (3) Process the tasklets on the local list one by one
while (list) {
struct tasklet_struct *t = list;

list = list->next;

// (4) Tasklet lock: guarantees a tasklet runs on only one CPU at a time
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {

// (6) Clear TASKLET_STATE_SCHED before the tasklet runs;
// from now on it may be queued again, but it cannot run yet
if (!test_and_clear_bit(TASKLET_STATE_SCHED,
&t->state))
BUG();

// (7) Run the actual tasklet handler
t->func(t->data);

// (8) Release the tasklet lock; other CPUs may now run this tasklet
tasklet_unlock(t);
continue;
}
tasklet_unlock(t);
}

local_irq_disable();
// (5) If the tasklet lock could not be taken, put the tasklet back on this
// CPU's tasklet_vec list so it runs next time
t->next = NULL;
*__this_cpu_read(tasklet_vec.tail) = t;
__this_cpu_write(tasklet_vec.tail, &(t->next));
__raise_softirq_irqoff(TASKLET_SOFTIRQ);
local_irq_enable();
}
}
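
Finally, a minimal sketch of the user-facing tasklet API; the handler and the surrounding driver code are hypothetical:

#include <linux/interrupt.h>

static void foo_tasklet_fn(unsigned long data)
{
	/* bottom-half work; runs via tasklet_action() in softirq context */
}

/* Statically declare an enabled tasklet bound to foo_tasklet_fn. */
static DECLARE_TASKLET(foo_tasklet, foo_tasklet_fn, 0);

static irqreturn_t foo_irq_handler(int irq, void *dev_id)
{
	/* top half: defer the bulk of the work */
	tasklet_schedule(&foo_tasklet);   /* queues onto tasklet_vec, raises TASKLET_SOFTIRQ */
	return IRQ_HANDLED;
}

static void foo_remove(void)
{
	tasklet_kill(&foo_tasklet);       /* wait for any pending or running instance to finish */
}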

References

ARMPG. ARM Cortex-A Series Programmer’s Guide for ARMv8-A
GICANALY. GIC 代码分析 (GIC code analysis)
