[PATCH] Hotplug CPUs: Other CPU_DEAD Notifiers
author Rusty Russell <rusty@rustcorp.com.au>
Fri, 19 Mar 2004 00:04:13 +0000 (16:04 -0800)
committer Linus Torvalds <torvalds@ppc970.osdl.org>
Fri, 19 Mar 2004 00:04:13 +0000 (16:04 -0800)
Various files keep per-cpu caches which need to be freed/moved when a
CPU goes down.  All under CONFIG_HOTPLUG_CPU ifdefs.

scsi.c: drain dead cpu's scsi_done_q onto this cpu.

buffer.c: brelse the bh_lrus queue for dead cpu.

timer.c: migrate timers from dead cpu, being careful of lock order vs
__mod_timer.

radix_tree.c: free dead cpu's radix_tree_preloads

page_alloc.c: empty dead cpu's nr_pagecache_local into nr_pagecache, and
free the pages in the dead cpu's local cache.

slab.c: stop reap_timer for dead cpu, adjust each cache's free limit, and
free each slab cache's per-cpu block.

swap.c: drain dead cpu's lru_add_pvecs into ours, and empty its committed_space
counter into the global counter.

dev.c: drain device queues from dead cpu into this one.

flow.c: drain dead cpu's flow cache.
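
All nine conversions share one shape: a notifier callback that acts on
CPU_DEAD, registered at init time.  hotcpu_notifier() compiles away when
CONFIG_HOTPLUG_CPU is off, which is why most files below use it instead of
ifdef'ing the registration itself.  A minimal sketch of the pattern, assuming
a hypothetical per-cpu list called foo_cache (foo_cache, foo_cpu_notify and
foo_init are illustrative names, not from this patch):

    #include <linux/list.h>
    #include <linux/notifier.h>
    #include <linux/cpu.h>
    #include <linux/percpu.h>
    #include <linux/init.h>

    /* Hypothetical per-cpu cache being made hotplug-safe; assume each
     * cpu's list head is initialized at boot. */
    static DEFINE_PER_CPU(struct list_head, foo_cache);

    static int foo_cpu_notify(struct notifier_block *self,
                              unsigned long action, void *hcpu)
    {
            int cpu = (unsigned long)hcpu;

            if (action == CPU_DEAD) {
                    /* The dead cpu is fully stopped by the time CPU_DEAD
                     * fires, so its per-cpu data can be spliced onto this
                     * cpu without taking its lock. */
                    list_splice_init(&per_cpu(foo_cache, cpu),
                                     &get_cpu_var(foo_cache));
                    put_cpu_var(foo_cache);
            }
            return NOTIFY_OK;
    }

    static int __init foo_init(void)
    {
            hotcpu_notifier(foo_cpu_notify, 0);
            return 0;
    }
    __initcall(foo_init);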

drivers/scsi/scsi.c
fs/buffer.c
kernel/timer.c
lib/radix-tree.c
mm/page_alloc.c
mm/slab.c
mm/swap.c
net/core/dev.c
net/core/flow.c

index a7e43523cfde1df6c4080f4fdc50b754219f4922..25e54a7bd824dcd4c2160a8129c9527cbf5195b5 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -53,6 +53,8 @@
 #include <linux/spinlock.h>
 #include <linux/kmod.h>
 #include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
 
 #include <scsi/scsi_host.h>
 #include "scsi.h"
@@ -1130,6 +1132,38 @@ int scsi_device_cancel(struct scsi_device *sdev, int recovery)
        return 0;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+static int scsi_cpu_notify(struct notifier_block *self,
+                          unsigned long action, void *hcpu)
+{
+       int cpu = (unsigned long)hcpu;
+
+       switch(action) {
+       case CPU_DEAD:
+               /* Drain scsi_done_q. */
+               local_irq_disable();
+               list_splice_init(&per_cpu(scsi_done_q, cpu),
+                                &__get_cpu_var(scsi_done_q));
+               raise_softirq_irqoff(SCSI_SOFTIRQ);
+               local_irq_enable();
+               break;
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata scsi_cpu_nb = {
+       .notifier_call  = scsi_cpu_notify,
+};
+
+#define register_scsi_cpu() register_cpu_notifier(&scsi_cpu_nb)
+#define unregister_scsi_cpu() unregister_cpu_notifier(&scsi_cpu_nb)
+#else
+#define register_scsi_cpu()
+#define unregister_scsi_cpu()
+#endif /* CONFIG_HOTPLUG_CPU */
+
 MODULE_DESCRIPTION("SCSI core");
 MODULE_LICENSE("GPL");
 
@@ -1164,6 +1198,7 @@ static int __init init_scsi(void)
 
        devfs_mk_dir("scsi");
        open_softirq(SCSI_SOFTIRQ, scsi_softirq, NULL);
+       register_scsi_cpu();
        printk(KERN_NOTICE "SCSI subsystem initialized\n");
        return 0;
 
@@ -1191,6 +1226,7 @@ static void __exit exit_scsi(void)
        devfs_remove("scsi");
        scsi_exit_procfs();
        scsi_exit_queue();
+       unregister_scsi_cpu();
 }
 
 subsys_initcall(init_scsi);
index c0a4c81ff4c9397b7cbd70c0eba99c9c0460a040..3e8cc0eeed9c5d139ed793edcfd163a79f9bb0ad 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3024,6 +3024,26 @@ init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
        }
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+static void buffer_exit_cpu(int cpu)
+{
+       int i;
+       struct bh_lru *b = &per_cpu(bh_lrus, cpu);
+
+       for (i = 0; i < BH_LRU_SIZE; i++) {
+               brelse(b->bhs[i]);
+               b->bhs[i] = NULL;
+       }
+}
+
+static int buffer_cpu_notify(struct notifier_block *self,
+                             unsigned long action, void *hcpu)
+{
+       if (action == CPU_DEAD)
+               buffer_exit_cpu((unsigned long)hcpu);
+       return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
 
 void __init buffer_init(void)
 {
@@ -3041,6 +3061,7 @@ void __init buffer_init(void)
         */
        nrpages = (nr_free_buffer_pages() * 10) / 100;
        max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
+       hotcpu_notifier(buffer_cpu_notify, 0);
 }
 
 EXPORT_SYMBOL(__bforget);
index 43bb91647fc33fbc58ea607cc5cd1f092b54f684..edef76c85c5d9a0f69400d8ff72b7d62cebdad1f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1222,7 +1222,73 @@ static void __devinit init_timers_cpu(int cpu)
 
        base->timer_jiffies = jiffies;
 }
-       
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
+{
+       struct timer_list *timer;
+
+       while (!list_empty(head)) {
+               timer = list_entry(head->next, struct timer_list, entry);
+               /* We're locking backwards from __mod_timer order here,
+                  beware deadlock. */
+               if (!spin_trylock(&timer->lock))
+                       return 0;
+               list_del(&timer->entry);
+               internal_add_timer(new_base, timer);
+               timer->base = new_base;
+               spin_unlock(&timer->lock);
+       }
+       return 1;
+}
+
+static void __devinit migrate_timers(int cpu)
+{
+       tvec_base_t *old_base;
+       tvec_base_t *new_base;
+       int i;
+
+       BUG_ON(cpu_online(cpu));
+       old_base = &per_cpu(tvec_bases, cpu);
+       new_base = &get_cpu_var(tvec_bases);
+
+       local_irq_disable();
+again:
+       /* Prevent deadlocks via ordering by old_base < new_base. */
+       if (old_base < new_base) {
+               spin_lock(&new_base->lock);
+               spin_lock(&old_base->lock);
+       } else {
+               spin_lock(&old_base->lock);
+               spin_lock(&new_base->lock);
+       }
+
+       if (old_base->running_timer)
+               BUG();
+       for (i = 0; i < TVR_SIZE; i++)
+               if (!migrate_timer_list(new_base, old_base->tv1.vec + i))
+                       goto unlock_again;
+       for (i = 0; i < TVN_SIZE; i++)
+               if (!migrate_timer_list(new_base, old_base->tv2.vec + i)
+                   || !migrate_timer_list(new_base, old_base->tv3.vec + i)
+                   || !migrate_timer_list(new_base, old_base->tv4.vec + i)
+                   || !migrate_timer_list(new_base, old_base->tv5.vec + i))
+                       goto unlock_again;
+       spin_unlock(&old_base->lock);
+       spin_unlock(&new_base->lock);
+       local_irq_enable();
+       put_cpu_var(tvec_bases);
+       return;
+
+unlock_again:
+       /* Avoid deadlock with __mod_timer, by backing off. */
+       spin_unlock(&old_base->lock);
+       spin_unlock(&new_base->lock);
+       cpu_relax();
+       goto again;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
 static int __devinit timer_cpu_notify(struct notifier_block *self, 
                                unsigned long action, void *hcpu)
 {
@@ -1231,6 +1297,11 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
        case CPU_UP_PREPARE:
                init_timers_cpu(cpu);
                break;
+#ifdef CONFIG_HOTPLUG_CPU
+       case CPU_DEAD:
+               migrate_timers(cpu);
+               break;
+#endif
        default:
                break;
        }
index cdfb8c6a97f1d540162559d4111ed1beaa4410fc..70ad32ff37ca9c4fbaead0ec60d30822d513127f 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -24,6 +24,8 @@
 #include <linux/radix-tree.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
 #include <linux/gfp.h>
 #include <linux/string.h>
 
@@ -420,6 +422,28 @@ static __init void radix_tree_init_maxindex(void)
                height_to_maxindex[i] = __maxindex(i);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+static int radix_tree_callback(struct notifier_block *nfb,
+                            unsigned long action,
+                            void *hcpu)
+{
+       int cpu = (long)hcpu;
+       struct radix_tree_preload *rtp;
+
+       /* Free per-cpu pool of preloaded nodes */
+       if (action == CPU_DEAD) {
+               rtp = &per_cpu(radix_tree_preloads, cpu);
+               while (rtp->nr) {
+                       kmem_cache_free(radix_tree_node_cachep,
+                                       rtp->nodes[rtp->nr-1]);
+                       rtp->nodes[rtp->nr-1] = NULL;
+                       rtp->nr--;
+               }
+       }
+       return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
 void __init radix_tree_init(void)
 {
        radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
@@ -428,4 +452,5 @@ void __init radix_tree_init(void)
        if (!radix_tree_node_cachep)
                panic ("Failed to create radix_tree_node cache\n");
        radix_tree_init_maxindex();
+       hotcpu_notifier(radix_tree_callback, 0);
 }
index 0b6923050c005de2d2fe23c1afa8bb45d4f5f4fd..dbdd232a49e094f60f8a368c748154a83064894d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1716,9 +1716,29 @@ struct seq_operations vmstat_op = {
 
 #endif /* CONFIG_PROC_FS */
 
+#ifdef CONFIG_HOTPLUG_CPU
+static int page_alloc_cpu_notify(struct notifier_block *self,
+                                unsigned long action, void *hcpu)
+{
+       int cpu = (unsigned long)hcpu;
+       long *count;
+
+       if (action == CPU_DEAD) {
+               /* Drain local pagecache count. */
+               count = &per_cpu(nr_pagecache_local, cpu);
+               atomic_add(*count, &nr_pagecache);
+               *count = 0;
+               local_irq_disable();
+               __drain_pages(cpu);
+               local_irq_enable();
+       }
+       return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
 
 void __init page_alloc_init(void)
 {
+       hotcpu_notifier(page_alloc_cpu_notify, 0);
 }
 
 /*
index d9f7c4be5102b3f92f87be85b7ef30ebf0260ef9..fd0ab945f8c2c52d9516c2ec4ce67b4d8462de4d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -589,12 +589,19 @@ static void __init start_cpu_timer(int cpu)
        }
 }
 
-/*
- * Note: if someone calls kmem_cache_alloc() on the new
- * cpu before the cpuup callback had a chance to allocate
- * the head arrays, it will oops.
- * Is CPU_ONLINE early enough?
- */
+#ifdef CONFIG_HOTPLUG_CPU
+static void stop_cpu_timer(int cpu)
+{
+       struct timer_list *rt = &per_cpu(reap_timers, cpu);
+
+       if (rt->function) {
+               del_timer_sync(rt);
+               WARN_ON(timer_pending(rt));
+               rt->function = NULL;
+       }
+}
+#endif
+
 static int __devinit cpuup_callback(struct notifier_block *nfb,
                                  unsigned long action,
                                  void *hcpu)
@@ -630,18 +637,28 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
        case CPU_ONLINE:
                start_cpu_timer(cpu);
                break;
+#ifdef CONFIG_HOTPLUG_CPU
+       case CPU_DEAD:
+               stop_cpu_timer(cpu);
+               /* fall thru */
        case CPU_UP_CANCELED:
                down(&cache_chain_sem);
 
                list_for_each_entry(cachep, &cache_chain, next) {
                        struct array_cache *nc;
 
+                       spin_lock_irq(&cachep->spinlock);
+                       /* cpu is dead; no one can alloc from it. */
                        nc = cachep->array[cpu];
                        cachep->array[cpu] = NULL;
+                       cachep->free_limit -= cachep->batchcount;
+                       free_block(cachep, ac_entry(nc), nc->avail);
+                       spin_unlock_irq(&cachep->spinlock);
                        kfree(nc);
                }
                up(&cache_chain_sem);
                break;
+#endif
        }
        return NOTIFY_OK;
 bad:
@@ -1486,6 +1503,9 @@ int kmem_cache_destroy (kmem_cache_t * cachep)
                return 1;
        }
 
+       /* No cpu_online check is required here: the per-cpu array
+        * entry is freed and set to NULL when the cpu goes offline.
+        */
        for (i = 0; i < NR_CPUS; i++)
                kfree(cachep->array[i]);
 
index 2f0cb0a78852a61e6cc4e60a790a03e44b0cb882..a5352c98751a4b2bfcebdd6d3b24e54e5f1938ae 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -27,6 +27,9 @@
 #include <linux/module.h>
 #include <linux/percpu_counter.h>
 #include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
 
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
@@ -381,7 +384,37 @@ void vm_acct_memory(long pages)
        preempt_enable();
 }
 EXPORT_SYMBOL(vm_acct_memory);
-#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void lru_drain_cache(unsigned int cpu)
+{
+       struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
+
+       /* CPU is dead, so no locking needed. */
+       if (pagevec_count(pvec))
+               __pagevec_lru_add(pvec);
+       pvec = &per_cpu(lru_add_active_pvecs, cpu);
+       if (pagevec_count(pvec))
+               __pagevec_lru_add_active(pvec);
+}
+
+/* Drop the CPU's cached committed space back into the central pool. */
+static int cpu_swap_callback(struct notifier_block *nfb,
+                            unsigned long action,
+                            void *hcpu)
+{
+       long *committed;
+
+       committed = &per_cpu(committed_space, (long)hcpu);
+       if (action == CPU_DEAD) {
+               atomic_add(*committed, &vm_committed_space);
+               *committed = 0;
+               lru_drain_cache((long)hcpu);
+       }
+       return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_SMP
 void percpu_counter_mod(struct percpu_counter *fbc, long amount)
@@ -420,4 +453,5 @@ void __init swap_setup(void)
         * Right now other parts of the system means that we
         * _really_ don't want to cluster much more
         */
+       hotcpu_notifier(cpu_swap_callback, 0);
 }
index b4418fb5eb2589547be66343c7455dc54af51361..201878db9853c67c9aba6744922cde861fdb8035 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -76,6 +76,7 @@
 #include <asm/system.h>
 #include <asm/bitops.h>
 #include <linux/config.h>
+#include <linux/cpu.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
@@ -3131,6 +3132,52 @@ int unregister_netdevice(struct net_device *dev)
        return 0;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+static int dev_cpu_callback(struct notifier_block *nfb,
+                           unsigned long action,
+                           void *ocpu)
+{
+       struct sk_buff **list_skb;
+       struct net_device **list_net;
+       struct sk_buff *skb;
+       unsigned int cpu, oldcpu = (unsigned long)ocpu;
+       struct softnet_data *sd, *oldsd;
+
+       if (action != CPU_DEAD)
+               return NOTIFY_OK;
+
+       local_irq_disable();
+       cpu = smp_processor_id();
+       sd = &per_cpu(softnet_data, cpu);
+       oldsd = &per_cpu(softnet_data, oldcpu);
+
+       /* Find end of our completion_queue. */
+       list_skb = &sd->completion_queue;
+       while (*list_skb)
+               list_skb = &(*list_skb)->next;
+       /* Append completion queue from offline CPU. */
+       *list_skb = oldsd->completion_queue;
+       oldsd->completion_queue = NULL;
+
+       /* Find end of our output_queue. */
+       list_net = &sd->output_queue;
+       while (*list_net)
+               list_net = &(*list_net)->next_sched;
+       /* Append output queue from offline CPU. */
+       *list_net = oldsd->output_queue;
+       oldsd->output_queue = NULL;
+
+       raise_softirq_irqoff(NET_TX_SOFTIRQ);
+       local_irq_enable();
+
+       /* Process offline CPU's input_pkt_queue */
+       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
+               netif_rx(skb);
+
+       return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
 
 /*
  *     Initialize the DEV module. At boot time this walks the device list and
@@ -3195,6 +3242,7 @@ static int __init net_dev_init(void)
        open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
        open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
 
+       hotcpu_notifier(dev_cpu_callback, 0);
        dst_init();
        dev_mcast_init();
        rc = 0;
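
dev_cpu_callback() above appends the dead cpu's completion and output queues
by walking a pointer to the link field itself (sk_buff ** and net_device **),
so an empty destination list needs no special case and no tail pointer is
kept.  The idiom in isolation, with an illustrative node type:

    struct node {
            struct node *next;
    };

    /* Advance through the link fields until head points at the NULL
     * tail link, then plant the extra chain there; if *head starts out
     * empty the loop never runs and extra simply becomes the list. */
    static void append_chain(struct node **head, struct node *extra)
    {
            while (*head)
                    head = &(*head)->next;
            *head = extra;
    }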
index cef037afdf41230ae42cc0efae26b45a0abff8d7..f289570b15a3b3e3ba442d14ca6e133a8dff575e 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -326,6 +326,17 @@ static void __devinit flow_cache_cpu_prepare(int cpu)
        tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+static int flow_cache_cpu(struct notifier_block *nfb,
+                         unsigned long action,
+                         void *hcpu)
+{
+       if (action == CPU_DEAD)
+               __flow_cache_shrink((unsigned long)hcpu, 0);
+       return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
 static int __init flow_cache_init(void)
 {
        int i;
@@ -350,6 +361,7 @@ static int __init flow_cache_init(void)
        for_each_cpu(i)
                flow_cache_cpu_prepare(i);
 
+       hotcpu_notifier(flow_cache_cpu, 0);
        return 0;
 }