]> pilppa.com Git - linux-2.6-omap-h63xx.git/commitdiff
[DLM] The core of the DLM for GFS2/CLVM
authorDavid Teigland <teigland@redhat.com>
Wed, 18 Jan 2006 09:30:29 +0000 (09:30 +0000)
committerSteven Whitehouse <swhiteho@redhat.com>
Wed, 18 Jan 2006 09:30:29 +0000 (09:30 +0000)
This is the core of the distributed lock manager which is required
to use GFS2 as a cluster filesystem. It is also used by CLVM and
can be used as a standalone lock manager independantly of either
of these two projects.

It implements VAX-style locking modes.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steve Whitehouse <swhiteho@redhat.com>
39 files changed:
fs/Kconfig
fs/Makefile
fs/dlm/Kconfig [new file with mode: 0644]
fs/dlm/Makefile [new file with mode: 0644]
fs/dlm/ast.c [new file with mode: 0644]
fs/dlm/ast.h [new file with mode: 0644]
fs/dlm/config.c [new file with mode: 0644]
fs/dlm/config.h [new file with mode: 0644]
fs/dlm/debug_fs.c [new file with mode: 0644]
fs/dlm/device.c [new file with mode: 0644]
fs/dlm/dir.c [new file with mode: 0644]
fs/dlm/dir.h [new file with mode: 0644]
fs/dlm/dlm_internal.h [new file with mode: 0644]
fs/dlm/lock.c [new file with mode: 0644]
fs/dlm/lock.h [new file with mode: 0644]
fs/dlm/lockspace.c [new file with mode: 0644]
fs/dlm/lockspace.h [new file with mode: 0644]
fs/dlm/lowcomms.c [new file with mode: 0644]
fs/dlm/lowcomms.h [new file with mode: 0644]
fs/dlm/lvb_table.h [new file with mode: 0644]
fs/dlm/main.c [new file with mode: 0644]
fs/dlm/member.c [new file with mode: 0644]
fs/dlm/member.h [new file with mode: 0644]
fs/dlm/memory.c [new file with mode: 0644]
fs/dlm/memory.h [new file with mode: 0644]
fs/dlm/midcomms.c [new file with mode: 0644]
fs/dlm/midcomms.h [new file with mode: 0644]
fs/dlm/rcom.c [new file with mode: 0644]
fs/dlm/rcom.h [new file with mode: 0644]
fs/dlm/recover.c [new file with mode: 0644]
fs/dlm/recover.h [new file with mode: 0644]
fs/dlm/recoverd.c [new file with mode: 0644]
fs/dlm/recoverd.h [new file with mode: 0644]
fs/dlm/requestqueue.c [new file with mode: 0644]
fs/dlm/requestqueue.h [new file with mode: 0644]
fs/dlm/util.c [new file with mode: 0644]
fs/dlm/util.h [new file with mode: 0644]
include/linux/dlm.h [new file with mode: 0644]
include/linux/dlm_device.h [new file with mode: 0644]

index bec8afa5704d3cc45be5591bc9393b107cd0fc34..3a32f3f97e9bda68e2bdb42bdee3fea25f1f39a2 100644 (file)
@@ -1831,6 +1831,7 @@ source "fs/partitions/Kconfig"
 endmenu
 
 source "fs/nls/Kconfig"
+source "fs/dlm/Kconfig"
 
 endmenu
 
index 0922727732c5655b1b9bc5f9c4294744db8d918e..b298f4fdc6f2faa7d6f56eb294923d92718dd346 100644 (file)
@@ -48,6 +48,7 @@ obj-$(CONFIG_SYSFS)           += sysfs/
 obj-y                          += devpts/
 
 obj-$(CONFIG_PROFILING)                += dcookies.o
+obj-$(CONFIG_DLM)              += dlm/
  
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)      += reiserfs/
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644 (file)
index 0000000..d01f735
--- /dev/null
@@ -0,0 +1,30 @@
+menu "Distributed Lock Manager"
+       depends on INET && EXPERIMENTAL
+
+config DLM
+       tristate "Distributed Lock Manager (DLM)"
+       depends on SYSFS
+       depends on IPV6 || IPV6=n
+       select IP_SCTP
+       select CONFIGFS_FS
+       help
+       A general purpose distributed lock manager for kernel or userspace
+       applications.
+
+config DLM_DEVICE
+       tristate "DLM device for userspace access"
+       depends on DLM
+       help
+       This module creates a misc device through which the dlm lockspace
+       and locking functions become available to userspace applications
+       (usually through the libdlm library).
+
+config DLM_DEBUG
+       bool "DLM debugging"
+       depends on DLM
+       help
+       Under the debugfs mount point, the name of each lockspace will
+       appear as a file in the "dlm" directory.  The output is the
+       list of resource and locks the local node knows about.
+
+endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644 (file)
index 0000000..1e6232e
--- /dev/null
@@ -0,0 +1,21 @@
+obj-$(CONFIG_DLM) +=           dlm.o
+obj-$(CONFIG_DLM_DEVICE) +=    dlm_device.o
+
+dlm-y :=                       ast.o \
+                               config.o \
+                               dir.o \
+                               lock.o \
+                               lockspace.o \
+                               lowcomms.o \
+                               main.o \
+                               member.o \
+                               memory.o \
+                               midcomms.o \
+                               rcom.o \
+                               recover.o \
+                               recoverd.o \
+                               requestqueue.o \
+                               util.o
+dlm-$(CONFIG_DLM_DEBUG) +=     debug_fs.o
+
+dlm_device-y :=                        device.o
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644 (file)
index 0000000..2bd1c5e
--- /dev/null
@@ -0,0 +1,167 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lock.h"
+#include "ast.h"
+
+#define WAKE_ASTS  0
+
+static struct list_head                ast_queue;
+static spinlock_t              ast_queue_lock;
+static struct task_struct *    astd_task;
+static unsigned long           astd_wakeflags;
+static struct semaphore                astd_running;
+
+
+void dlm_del_ast(struct dlm_lkb *lkb)
+{
+       spin_lock(&ast_queue_lock);
+       if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
+               list_del(&lkb->lkb_astqueue);
+       spin_unlock(&ast_queue_lock);
+}
+
+void dlm_add_ast(struct dlm_lkb *lkb, int type)
+{
+       spin_lock(&ast_queue_lock);
+       if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
+               kref_get(&lkb->lkb_ref);
+               list_add_tail(&lkb->lkb_astqueue, &ast_queue);
+       }
+       lkb->lkb_ast_type |= type;
+       spin_unlock(&ast_queue_lock);
+
+       set_bit(WAKE_ASTS, &astd_wakeflags);
+       wake_up_process(astd_task);
+}
+
+static void process_asts(void)
+{
+       struct dlm_ls *ls = NULL;
+       struct dlm_rsb *r = NULL;
+       struct dlm_lkb *lkb;
+       void (*cast) (long param);
+       void (*bast) (long param, int mode);
+       int type = 0, found, bmode;
+
+       for (;;) {
+               found = FALSE;
+               spin_lock(&ast_queue_lock);
+               list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
+                       r = lkb->lkb_resource;
+                       ls = r->res_ls;
+
+                       if (dlm_locking_stopped(ls))
+                               continue;
+
+                       list_del(&lkb->lkb_astqueue);
+                       type = lkb->lkb_ast_type;
+                       lkb->lkb_ast_type = 0;
+                       found = TRUE;
+                       break;
+               }
+               spin_unlock(&ast_queue_lock);
+
+               if (!found)
+                       break;
+
+               cast = lkb->lkb_astaddr;
+               bast = lkb->lkb_bastaddr;
+               bmode = lkb->lkb_bastmode;
+
+               if ((type & AST_COMP) && cast)
+                       cast(lkb->lkb_astparam);
+
+               /* FIXME: Is it safe to look at lkb_grmode here
+                  without doing a lock_rsb() ?
+                  Look at other checks in v1 to avoid basts. */
+
+               if ((type & AST_BAST) && bast)
+                       if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
+                               bast(lkb->lkb_astparam, bmode);
+
+               /* this removes the reference added by dlm_add_ast
+                  and may result in the lkb being freed */
+               dlm_put_lkb(lkb);
+
+               schedule();
+       }
+}
+
+static inline int no_asts(void)
+{
+       int ret;
+
+       spin_lock(&ast_queue_lock);
+       ret = list_empty(&ast_queue);
+       spin_unlock(&ast_queue_lock);
+       return ret;
+}
+
+static int dlm_astd(void *data)
+{
+       while (!kthread_should_stop()) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (!test_bit(WAKE_ASTS, &astd_wakeflags))
+                       schedule();
+               set_current_state(TASK_RUNNING);
+
+               down(&astd_running);
+               if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
+                       process_asts();
+               up(&astd_running);
+       }
+       return 0;
+}
+
+void dlm_astd_wake(void)
+{
+       if (!no_asts()) {
+               set_bit(WAKE_ASTS, &astd_wakeflags);
+               wake_up_process(astd_task);
+       }
+}
+
+int dlm_astd_start(void)
+{
+       struct task_struct *p;
+       int error = 0;
+
+       INIT_LIST_HEAD(&ast_queue);
+       spin_lock_init(&ast_queue_lock);
+       init_MUTEX(&astd_running);
+
+       p = kthread_run(dlm_astd, NULL, "dlm_astd");
+       if (IS_ERR(p))
+               error = PTR_ERR(p);
+       else
+               astd_task = p;
+       return error;
+}
+
+void dlm_astd_stop(void)
+{
+       kthread_stop(astd_task);
+}
+
+void dlm_astd_suspend(void)
+{
+       down(&astd_running);
+}
+
+void dlm_astd_resume(void)
+{
+       up(&astd_running);
+}
+
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644 (file)
index 0000000..6ee276c
--- /dev/null
@@ -0,0 +1,26 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __ASTD_DOT_H__
+#define __ASTD_DOT_H__
+
+void dlm_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_del_ast(struct dlm_lkb *lkb);
+
+void dlm_astd_wake(void);
+int dlm_astd_start(void);
+void dlm_astd_stop(void);
+void dlm_astd_suspend(void);
+void dlm_astd_resume(void);
+
+#endif
+
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644 (file)
index 0000000..024ace9
--- /dev/null
@@ -0,0 +1,787 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/configfs.h>
+#include <net/sock.h>
+
+#include "config.h"
+
+/*
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
+ * /config/dlm/<cluster>/comms/<comm>/nodeid
+ * /config/dlm/<cluster>/comms/<comm>/local
+ * /config/dlm/<cluster>/comms/<comm>/addr
+ * The <cluster> level is useless, but I haven't figured out how to avoid it.
+ */
+
+static struct config_group *space_list;
+static struct config_group *comm_list;
+static struct comm *local_comm;
+
+struct clusters;
+struct cluster;
+struct spaces;
+struct space;
+struct comms;
+struct comm;
+struct nodes;
+struct node;
+
+static struct config_group *make_cluster(struct config_group *, const char *);
+static void drop_cluster(struct config_group *, struct config_item *);
+static void release_cluster(struct config_item *);
+static struct config_group *make_space(struct config_group *, const char *);
+static void drop_space(struct config_group *, struct config_item *);
+static void release_space(struct config_item *);
+static struct config_item *make_comm(struct config_group *, const char *);
+static void drop_comm(struct config_group *, struct config_item *);
+static void release_comm(struct config_item *);
+static struct config_item *make_node(struct config_group *, const char *);
+static void drop_node(struct config_group *, struct config_item *);
+static void release_node(struct config_item *);
+
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+                        char *buf);
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+                         const char *buf, size_t len);
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+                        char *buf);
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+                         const char *buf, size_t len);
+
+static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
+static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_local_read(struct comm *cm, char *buf);
+static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t node_nodeid_read(struct node *nd, char *buf);
+static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
+static ssize_t node_weight_read(struct node *nd, char *buf);
+static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
+
+enum {
+       COMM_ATTR_NODEID = 0,
+       COMM_ATTR_LOCAL,
+       COMM_ATTR_ADDR,
+};
+
+struct comm_attribute {
+       struct configfs_attribute attr;
+       ssize_t (*show)(struct comm *, char *);
+       ssize_t (*store)(struct comm *, const char *, size_t);
+};
+
+static struct comm_attribute comm_attr_nodeid = {
+       .attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "nodeid",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+       .show   = comm_nodeid_read,
+       .store  = comm_nodeid_write,
+};
+
+static struct comm_attribute comm_attr_local = {
+       .attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "local",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+       .show   = comm_local_read,
+       .store  = comm_local_write,
+};
+
+static struct comm_attribute comm_attr_addr = {
+       .attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "addr",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+       .store  = comm_addr_write,
+};
+
+static struct configfs_attribute *comm_attrs[] = {
+       [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
+       [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
+       [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
+       NULL,
+};
+
+enum {
+       NODE_ATTR_NODEID = 0,
+       NODE_ATTR_WEIGHT,
+};
+
+struct node_attribute {
+       struct configfs_attribute attr;
+       ssize_t (*show)(struct node *, char *);
+       ssize_t (*store)(struct node *, const char *, size_t);
+};
+
+static struct node_attribute node_attr_nodeid = {
+       .attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "nodeid",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+       .show   = node_nodeid_read,
+       .store  = node_nodeid_write,
+};
+
+static struct node_attribute node_attr_weight = {
+       .attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "weight",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+       .show   = node_weight_read,
+       .store  = node_weight_write,
+};
+
+static struct configfs_attribute *node_attrs[] = {
+       [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
+       [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
+       NULL,
+};
+
+struct clusters {
+       struct configfs_subsystem subsys;
+};
+
+struct cluster {
+       struct config_group group;
+};
+
+struct spaces {
+       struct config_group ss_group;
+};
+
+struct space {
+       struct config_group group;
+       struct list_head members;
+       struct semaphore members_lock;
+       int members_count;
+};
+
+struct comms {
+       struct config_group cs_group;
+};
+
+struct comm {
+       struct config_item item;
+       int nodeid;
+       int local;
+       int addr_count;
+       struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
+};
+
+struct nodes {
+       struct config_group ns_group;
+};
+
+struct node {
+       struct config_item item;
+       struct list_head list; /* space->members */
+       int nodeid;
+       int weight;
+};
+
+static struct configfs_group_operations clusters_ops = {
+       .make_group = make_cluster,
+       .drop_item = drop_cluster,
+};
+
+static struct configfs_item_operations cluster_ops = {
+       .release = release_cluster,
+};
+
+static struct configfs_group_operations spaces_ops = {
+       .make_group = make_space,
+       .drop_item = drop_space,
+};
+
+static struct configfs_item_operations space_ops = {
+       .release = release_space,
+};
+
+static struct configfs_group_operations comms_ops = {
+       .make_item = make_comm,
+       .drop_item = drop_comm,
+};
+
+static struct configfs_item_operations comm_ops = {
+       .release = release_comm,
+       .show_attribute = show_comm,
+       .store_attribute = store_comm,
+};
+
+static struct configfs_group_operations nodes_ops = {
+       .make_item = make_node,
+       .drop_item = drop_node,
+};
+
+static struct configfs_item_operations node_ops = {
+       .release = release_node,
+       .show_attribute = show_node,
+       .store_attribute = store_node,
+};
+
+static struct config_item_type clusters_type = {
+       .ct_group_ops = &clusters_ops,
+       .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type cluster_type = {
+       .ct_item_ops = &cluster_ops,
+       .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type spaces_type = {
+       .ct_group_ops = &spaces_ops,
+       .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type space_type = {
+       .ct_item_ops = &space_ops,
+       .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type comms_type = {
+       .ct_group_ops = &comms_ops,
+       .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type comm_type = {
+       .ct_item_ops = &comm_ops,
+       .ct_attrs = comm_attrs,
+       .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type nodes_type = {
+       .ct_group_ops = &nodes_ops,
+       .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type node_type = {
+       .ct_item_ops = &node_ops,
+       .ct_attrs = node_attrs,
+       .ct_owner = THIS_MODULE,
+};
+
+static struct cluster *to_cluster(struct config_item *i)
+{
+       return i ? container_of(to_config_group(i), struct cluster, group):NULL;
+}
+
+static struct space *to_space(struct config_item *i)
+{
+       return i ? container_of(to_config_group(i), struct space, group) : NULL;
+}
+
+static struct comm *to_comm(struct config_item *i)
+{
+       return i ? container_of(i, struct comm, item) : NULL;
+}
+
+static struct node *to_node(struct config_item *i)
+{
+       return i ? container_of(i, struct node, item) : NULL;
+}
+
+static struct config_group *make_cluster(struct config_group *g,
+                                        const char *name)
+{
+       struct cluster *cl = NULL;
+       struct spaces *sps = NULL;
+       struct comms *cms = NULL;
+       void *gps = NULL;
+
+       cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
+       gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
+       sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
+       cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
+
+       if (!cl || !gps || !sps || !cms)
+               goto fail;
+
+       config_group_init_type_name(&cl->group, name, &cluster_type);
+       config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
+       config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
+
+       cl->group.default_groups = gps;
+       cl->group.default_groups[0] = &sps->ss_group;
+       cl->group.default_groups[1] = &cms->cs_group;
+       cl->group.default_groups[2] = NULL;
+
+       space_list = &sps->ss_group;
+       comm_list = &cms->cs_group;
+       return &cl->group;
+
+ fail:
+       kfree(cl);
+       kfree(gps);
+       kfree(sps);
+       kfree(cms);
+       return NULL;
+}
+
+static void drop_cluster(struct config_group *g, struct config_item *i)
+{
+       struct cluster *cl = to_cluster(i);
+       struct config_item *tmp;
+       int j;
+
+       for (j = 0; cl->group.default_groups[j]; j++) {
+               tmp = &cl->group.default_groups[j]->cg_item;
+               cl->group.default_groups[j] = NULL;
+               config_item_put(tmp);
+       }
+
+       space_list = NULL;
+       comm_list = NULL;
+
+       config_item_put(i);
+}
+
+static void release_cluster(struct config_item *i)
+{
+       struct cluster *cl = to_cluster(i);
+       kfree(cl->group.default_groups);
+       kfree(cl);
+}
+
+static struct config_group *make_space(struct config_group *g, const char *name)
+{
+       struct space *sp = NULL;
+       struct nodes *nds = NULL;
+       void *gps = NULL;
+
+       sp = kzalloc(sizeof(struct space), GFP_KERNEL);
+       gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
+       nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
+
+       if (!sp || !gps || !nds)
+               goto fail;
+
+       config_group_init_type_name(&sp->group, name, &space_type);
+       config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
+
+       sp->group.default_groups = gps;
+       sp->group.default_groups[0] = &nds->ns_group;
+       sp->group.default_groups[1] = NULL;
+
+       INIT_LIST_HEAD(&sp->members);
+       init_MUTEX(&sp->members_lock);
+       sp->members_count = 0;
+       return &sp->group;
+
+ fail:
+       kfree(sp);
+       kfree(gps);
+       kfree(nds);
+       return NULL;
+}
+
+static void drop_space(struct config_group *g, struct config_item *i)
+{
+       struct space *sp = to_space(i);
+       struct config_item *tmp;
+       int j;
+
+       /* assert list_empty(&sp->members) */
+
+       for (j = 0; sp->group.default_groups[j]; j++) {
+               tmp = &sp->group.default_groups[j]->cg_item;
+               sp->group.default_groups[j] = NULL;
+               config_item_put(tmp);
+       }
+
+       config_item_put(i);
+}
+
+static void release_space(struct config_item *i)
+{
+       struct space *sp = to_space(i);
+       kfree(sp->group.default_groups);
+       kfree(sp);
+}
+
+static struct config_item *make_comm(struct config_group *g, const char *name)
+{
+       struct comm *cm;
+
+       cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
+       if (!cm)
+               return NULL;
+
+       config_item_init_type_name(&cm->item, name, &comm_type);
+       cm->nodeid = -1;
+       cm->local = 0;
+       cm->addr_count = 0;
+       return &cm->item;
+}
+
+static void drop_comm(struct config_group *g, struct config_item *i)
+{
+       struct comm *cm = to_comm(i);
+       if (local_comm == cm)
+               local_comm = NULL;
+       while (cm->addr_count--)
+               kfree(cm->addr[cm->addr_count]);
+       config_item_put(i);
+}
+
+static void release_comm(struct config_item *i)
+{
+       struct comm *cm = to_comm(i);
+       kfree(cm);
+}
+
+static struct config_item *make_node(struct config_group *g, const char *name)
+{
+       struct space *sp = to_space(g->cg_item.ci_parent);
+       struct node *nd;
+
+       nd = kzalloc(sizeof(struct node), GFP_KERNEL);
+       if (!nd)
+               return NULL;
+
+       config_item_init_type_name(&nd->item, name, &node_type);
+       nd->nodeid = -1;
+       nd->weight = 1;  /* default weight of 1 if none is set */
+
+       down(&sp->members_lock);
+       list_add(&nd->list, &sp->members);
+       sp->members_count++;
+       up(&sp->members_lock);
+
+       return &nd->item;
+}
+
+static void drop_node(struct config_group *g, struct config_item *i)
+{
+       struct space *sp = to_space(g->cg_item.ci_parent);
+       struct node *nd = to_node(i);
+
+       down(&sp->members_lock);
+       list_del(&nd->list);
+       sp->members_count--;
+       up(&sp->members_lock);
+
+       config_item_put(i);
+}
+
+static void release_node(struct config_item *i)
+{
+       struct node *nd = to_node(i);
+       kfree(nd);
+}
+
+static struct clusters clusters_root = {
+       .subsys = {
+               .su_group = {
+                       .cg_item = {
+                               .ci_namebuf = "dlm",
+                               .ci_type = &clusters_type,
+                       },
+               },
+       },
+};
+
+int dlm_config_init(void)
+{
+       config_group_init(&clusters_root.subsys.su_group);
+       init_MUTEX(&clusters_root.subsys.su_sem);
+       return configfs_register_subsystem(&clusters_root.subsys);
+}
+
+void dlm_config_exit(void)
+{
+       configfs_unregister_subsystem(&clusters_root.subsys);
+}
+
+/*
+ * Functions for user space to read/write attributes
+ */
+
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+                        char *buf)
+{
+       struct comm *cm = to_comm(i);
+       struct comm_attribute *cma =
+                       container_of(a, struct comm_attribute, attr);
+       return cma->show ? cma->show(cm, buf) : 0;
+}
+
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+                         const char *buf, size_t len)
+{
+       struct comm *cm = to_comm(i);
+       struct comm_attribute *cma =
+               container_of(a, struct comm_attribute, attr);
+       return cma->store ? cma->store(cm, buf, len) : -EINVAL;
+}
+
+static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
+{
+       return sprintf(buf, "%d\n", cm->nodeid);
+}
+
+static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
+{
+       cm->nodeid = simple_strtol(buf, NULL, 0);
+       return len;
+}
+
+static ssize_t comm_local_read(struct comm *cm, char *buf)
+{
+       return sprintf(buf, "%d\n", cm->local);
+}
+
+static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
+{
+       cm->local= simple_strtol(buf, NULL, 0);
+       if (cm->local && !local_comm)
+               local_comm = cm;
+       return len;
+}
+
+static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
+{
+       struct sockaddr_storage *addr;
+
+       if (len != sizeof(struct sockaddr_storage))
+               return -EINVAL;
+
+       if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
+               return -ENOSPC;
+
+       addr = kzalloc(sizeof(*addr), GFP_KERNEL);
+       if (!addr)
+               return -ENOMEM;
+
+       memcpy(addr, buf, len);
+       cm->addr[cm->addr_count++] = addr;
+       return len;
+}
+
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+                        char *buf)
+{
+       struct node *nd = to_node(i);
+       struct node_attribute *nda =
+                       container_of(a, struct node_attribute, attr);
+       return nda->show ? nda->show(nd, buf) : 0;
+}
+
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+                         const char *buf, size_t len)
+{
+       struct node *nd = to_node(i);
+       struct node_attribute *nda =
+               container_of(a, struct node_attribute, attr);
+       return nda->store ? nda->store(nd, buf, len) : -EINVAL;
+}
+
+static ssize_t node_nodeid_read(struct node *nd, char *buf)
+{
+       return sprintf(buf, "%d\n", nd->nodeid);
+}
+
+static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
+{
+       nd->nodeid = simple_strtol(buf, NULL, 0);
+       return len;
+}
+
+static ssize_t node_weight_read(struct node *nd, char *buf)
+{
+       return sprintf(buf, "%d\n", nd->weight);
+}
+
+static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
+{
+       nd->weight = simple_strtol(buf, NULL, 0);
+       return len;
+}
+
+/*
+ * Functions for the dlm to get the info that's been configured
+ */
+
+static struct space *get_space(char *name)
+{
+       if (!space_list)
+               return NULL;
+       return to_space(config_group_find_obj(space_list, name));
+}
+
+static void put_space(struct space *sp)
+{
+       config_item_put(&sp->group.cg_item);
+}
+
+static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
+{
+       struct config_item *i;
+       struct comm *cm = NULL;
+       int found = 0;
+
+       if (!comm_list)
+               return NULL;
+
+       down(&clusters_root.subsys.su_sem);
+
+       list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
+               cm = to_comm(i);
+
+               if (nodeid) {
+                       if (cm->nodeid != nodeid)
+                               continue;
+                       found = 1;
+                       break;
+               } else {
+                       if (!cm->addr_count ||
+                           memcmp(cm->addr[0], addr, sizeof(*addr)))
+                               continue;
+                       found = 1;
+                       break;
+               }
+       }
+       up(&clusters_root.subsys.su_sem);
+
+       if (found)
+               config_item_get(i);
+       else
+               cm = NULL;
+       return cm;
+}
+
+static void put_comm(struct comm *cm)
+{
+       config_item_put(&cm->item);
+}
+
+/* caller must free mem */
+int dlm_nodeid_list(char *lsname, int **ids_out)
+{
+       struct space *sp;
+       struct node *nd;
+       int i = 0, rv = 0;
+       int *ids;
+
+       sp = get_space(lsname);
+       if (!sp)
+               return -EEXIST;
+
+       down(&sp->members_lock);
+       if (!sp->members_count) {
+               rv = 0;
+               goto out;
+       }
+
+       ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
+       if (!ids) {
+               rv = -ENOMEM;
+               goto out;
+       }
+
+       rv = sp->members_count;
+       list_for_each_entry(nd, &sp->members, list)
+               ids[i++] = nd->nodeid;
+
+       if (rv != i)
+               printk("bad nodeid count %d %d\n", rv, i);
+
+       *ids_out = ids;
+ out:
+       up(&sp->members_lock);
+       put_space(sp);
+       return rv;
+}
+
+int dlm_node_weight(char *lsname, int nodeid)
+{
+       struct space *sp;
+       struct node *nd;
+       int w = -EEXIST;
+
+       sp = get_space(lsname);
+       if (!sp)
+               goto out;
+
+       down(&sp->members_lock);
+       list_for_each_entry(nd, &sp->members, list) {
+               if (nd->nodeid != nodeid)
+                       continue;
+               w = nd->weight;
+               break;
+       }
+       up(&sp->members_lock);
+       put_space(sp);
+ out:
+       return w;
+}
+
+int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
+{
+       struct comm *cm = get_comm(nodeid, NULL);
+       if (!cm)
+               return -EEXIST;
+       if (!cm->addr_count)
+               return -ENOENT;
+       memcpy(addr, cm->addr[0], sizeof(*addr));
+       put_comm(cm);
+       return 0;
+}
+
+int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
+{
+       struct comm *cm = get_comm(0, addr);
+       if (!cm)
+               return -EEXIST;
+       *nodeid = cm->nodeid;
+       put_comm(cm);
+       return 0;
+}
+
+int dlm_our_nodeid(void)
+{
+       return local_comm ? local_comm->nodeid : 0;
+}
+
+/* num 0 is first addr, num 1 is second addr */
+int dlm_our_addr(struct sockaddr_storage *addr, int num)
+{
+       if (!local_comm)
+               return -1;
+       if (num + 1 > local_comm->addr_count)
+               return -1;
+       memcpy(addr, local_comm->addr[num], sizeof(*addr));
+       return 0;
+}
+
+/* Config file defaults */
+#define DEFAULT_TCP_PORT       21064
+#define DEFAULT_BUFFER_SIZE     4096
+#define DEFAULT_RSBTBL_SIZE      256
+#define DEFAULT_LKBTBL_SIZE     1024
+#define DEFAULT_DIRTBL_SIZE      512
+#define DEFAULT_RECOVER_TIMER      5
+#define DEFAULT_TOSS_SECS         10
+#define DEFAULT_SCAN_SECS          5
+
+struct dlm_config_info dlm_config = {
+       .tcp_port = DEFAULT_TCP_PORT,
+       .buffer_size = DEFAULT_BUFFER_SIZE,
+       .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
+       .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
+       .dirtbl_size = DEFAULT_DIRTBL_SIZE,
+       .recover_timer = DEFAULT_RECOVER_TIMER,
+       .toss_secs = DEFAULT_TOSS_SECS,
+       .scan_secs = DEFAULT_SCAN_SECS
+};
+
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644 (file)
index 0000000..9da7839
--- /dev/null
@@ -0,0 +1,42 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CONFIG_DOT_H__
+#define __CONFIG_DOT_H__
+
+#define DLM_MAX_ADDR_COUNT 3
+
+struct dlm_config_info {
+       int tcp_port;
+       int buffer_size;
+       int rsbtbl_size;
+       int lkbtbl_size;
+       int dirtbl_size;
+       int recover_timer;
+       int toss_secs;
+       int scan_secs;
+};
+
+extern struct dlm_config_info dlm_config;
+
+int dlm_config_init(void);
+void dlm_config_exit(void);
+int dlm_node_weight(char *lsname, int nodeid);
+int dlm_nodeid_list(char *lsname, int **ids_out);
+int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
+int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
+int dlm_our_nodeid(void);
+int dlm_our_addr(struct sockaddr_storage *addr, int num);
+
+#endif                         /* __CONFIG_DOT_H__ */
+
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644 (file)
index 0000000..98b49a1
--- /dev/null
@@ -0,0 +1,310 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+
+#include "dlm_internal.h"
+
+
+static struct dentry *dlm_root;
+
+struct rsb_iter {
+       int entry;
+       struct dlm_ls *ls;
+       struct list_head *next;
+       struct dlm_rsb *rsb;
+};
+
+static char *print_lockmode(int mode)
+{
+       switch (mode) {
+       case DLM_LOCK_IV:
+               return "--";
+       case DLM_LOCK_NL:
+               return "NL";
+       case DLM_LOCK_CR:
+               return "CR";
+       case DLM_LOCK_CW:
+               return "CW";
+       case DLM_LOCK_PR:
+               return "PR";
+       case DLM_LOCK_PW:
+               return "PW";
+       case DLM_LOCK_EX:
+               return "EX";
+       default:
+               return "??";
+       }
+}
+
+static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
+                      struct dlm_rsb *res)
+{
+       seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
+
+       if (lkb->lkb_status == DLM_LKSTS_CONVERT
+           || lkb->lkb_status == DLM_LKSTS_WAITING)
+               seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
+
+       if (lkb->lkb_range) {
+               /* FIXME: this warns on Alpha */
+               if (lkb->lkb_status == DLM_LKSTS_CONVERT
+                   || lkb->lkb_status == DLM_LKSTS_GRANTED)
+                       seq_printf(s, " %" PRIx64 "-%" PRIx64,
+                                  lkb->lkb_range[GR_RANGE_START],
+                                  lkb->lkb_range[GR_RANGE_END]);
+               if (lkb->lkb_status == DLM_LKSTS_CONVERT
+                   || lkb->lkb_status == DLM_LKSTS_WAITING)
+                       seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
+                                  lkb->lkb_range[RQ_RANGE_START],
+                                  lkb->lkb_range[RQ_RANGE_END]);
+       }
+
+       if (lkb->lkb_nodeid) {
+               if (lkb->lkb_nodeid != res->res_nodeid)
+                       seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
+                                  lkb->lkb_remid);
+               else
+                       seq_printf(s, " Master:     %08x", lkb->lkb_remid);
+       }
+
+       if (lkb->lkb_wait_type)
+               seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+
+       seq_printf(s, "\n");
+}
+
+static int print_resource(struct dlm_rsb *res, struct seq_file *s)
+{
+       struct dlm_lkb *lkb;
+       int i, lvblen = res->res_ls->ls_lvblen;
+
+       seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
+       for (i = 0; i < res->res_length; i++) {
+               if (isprint(res->res_name[i]))
+                       seq_printf(s, "%c", res->res_name[i]);
+               else
+                       seq_printf(s, "%c", '.');
+       }
+       if (res->res_nodeid > 0)
+               seq_printf(s, "\"  \nLocal Copy, Master is node %d\n",
+                          res->res_nodeid);
+       else if (res->res_nodeid == 0)
+               seq_printf(s, "\"  \nMaster Copy\n");
+       else if (res->res_nodeid == -1)
+               seq_printf(s, "\"  \nLooking up master (lkid %x)\n",
+                          res->res_first_lkid);
+       else
+               seq_printf(s, "\"  \nInvalid master %d\n", res->res_nodeid);
+
+       /* Print the LVB: */
+       if (res->res_lvbptr) {
+               seq_printf(s, "LVB: ");
+               for (i = 0; i < lvblen; i++) {
+                       if (i == lvblen / 2)
+                               seq_printf(s, "\n     ");
+                       seq_printf(s, "%02x ",
+                                  (unsigned char) res->res_lvbptr[i]);
+               }
+               if (rsb_flag(res, RSB_VALNOTVALID))
+                       seq_printf(s, " (INVALID)");
+               seq_printf(s, "\n");
+       }
+
+       /* Print the locks attached to this resource */
+       seq_printf(s, "Granted Queue\n");
+       list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
+               print_lock(s, lkb, res);
+
+       seq_printf(s, "Conversion Queue\n");
+       list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
+               print_lock(s, lkb, res);
+
+       seq_printf(s, "Waiting Queue\n");
+       list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
+               print_lock(s, lkb, res);
+
+       return 0;
+}
+
+static int rsb_iter_next(struct rsb_iter *ri)
+{
+       struct dlm_ls *ls = ri->ls;
+       int i;
+
+       if (!ri->next) {
+ top:
+               /* Find the next non-empty hash bucket */
+               for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
+                       read_lock(&ls->ls_rsbtbl[i].lock);
+                       if (!list_empty(&ls->ls_rsbtbl[i].list)) {
+                               ri->next = ls->ls_rsbtbl[i].list.next;
+                               read_unlock(&ls->ls_rsbtbl[i].lock);
+                               break;
+                       }
+                       read_unlock(&ls->ls_rsbtbl[i].lock);
+                }
+               ri->entry = i;
+
+               if (ri->entry >= ls->ls_rsbtbl_size)
+                       return 1;
+       } else {
+               i = ri->entry;
+               read_lock(&ls->ls_rsbtbl[i].lock);
+               ri->next = ri->next->next;
+               if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
+                       /* End of list - move to next bucket */
+                       ri->next = NULL;
+                       ri->entry++;
+                       read_unlock(&ls->ls_rsbtbl[i].lock);
+                       goto top;
+                }
+               read_unlock(&ls->ls_rsbtbl[i].lock);
+       }
+       ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
+
+       return 0;
+}
+
+static void rsb_iter_free(struct rsb_iter *ri)
+{
+       kfree(ri);
+}
+
+static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
+{
+       struct rsb_iter *ri;
+
+       ri = kmalloc(sizeof *ri, GFP_KERNEL);
+       if (!ri)
+               return NULL;
+
+       ri->ls = ls;
+       ri->entry = 0;
+       ri->next = NULL;
+
+       if (rsb_iter_next(ri)) {
+               rsb_iter_free(ri);
+               return NULL;
+       }
+
+       return ri;
+}
+
+static void *seq_start(struct seq_file *file, loff_t *pos)
+{
+       struct rsb_iter *ri;
+       loff_t n = *pos;
+
+       ri = rsb_iter_init(file->private);
+       if (!ri)
+               return NULL;
+
+       while (n--) {
+               if (rsb_iter_next(ri)) {
+                       rsb_iter_free(ri);
+                       return NULL;
+               }
+       }
+
+       return ri;
+}
+
+static void *seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
+{
+       struct rsb_iter *ri = iter_ptr;
+
+       (*pos)++;
+
+       if (rsb_iter_next(ri)) {
+               rsb_iter_free(ri);
+               return NULL;
+       }
+
+       return ri;
+}
+
+static void seq_stop(struct seq_file *file, void *iter_ptr)
+{
+       /* nothing for now */
+}
+
+static int seq_show(struct seq_file *file, void *iter_ptr)
+{
+       struct rsb_iter *ri = iter_ptr;
+
+       print_resource(ri->rsb, file);
+
+       return 0;
+}
+
+static struct seq_operations dlm_seq_ops = {
+       .start = seq_start,
+       .next  = seq_next,
+       .stop  = seq_stop,
+       .show  = seq_show,
+};
+
+static int do_open(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq;
+       int ret;
+
+       ret = seq_open(file, &dlm_seq_ops);
+       if (ret)
+               return ret;
+
+       seq = file->private_data;
+       seq->private = inode->u.generic_ip;
+
+       return 0;
+}
+
+static struct file_operations dlm_fops = {
+       .owner   = THIS_MODULE,
+       .open    = do_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release
+};
+
+int dlm_create_debug_file(struct dlm_ls *ls)
+{
+       ls->ls_debug_dentry = debugfs_create_file(ls->ls_name,
+                                                 S_IFREG | S_IRUGO,
+                                                 dlm_root,
+                                                 ls,
+                                                 &dlm_fops);
+       return ls->ls_debug_dentry ? 0 : -ENOMEM;
+}
+
+void dlm_delete_debug_file(struct dlm_ls *ls)
+{
+       if (ls->ls_debug_dentry)
+               debugfs_remove(ls->ls_debug_dentry);
+}
+
+int dlm_register_debugfs(void)
+{
+       dlm_root = debugfs_create_dir("dlm", NULL);
+       return dlm_root ? 0 : -ENOMEM;
+}
+
+void dlm_unregister_debugfs(void)
+{
+       debugfs_remove(dlm_root);
+}
+
diff --git a/fs/dlm/device.c b/fs/dlm/device.c
new file mode 100644 (file)
index 0000000..a8bf600
--- /dev/null
@@ -0,0 +1,1084 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * device.c
+ *
+ * This is the userland interface to the DLM.
+ *
+ * The locking is done via a misc char device (find the
+ * registered minor number in /proc/misc).
+ *
+ * User code should not use this interface directly but
+ * call the library routines in libdlm.a instead.
+ *
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/signal.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+
+#include <linux/dlm.h>
+#include <linux/dlm_device.h>
+
+#include "lvb_table.h"
+
+static struct file_operations _dlm_fops;
+static const char *name_prefix="dlm";
+static struct list_head user_ls_list;
+static struct semaphore user_ls_lock;
+
+/* Lock infos are stored in here indexed by lock ID */
+static DEFINE_IDR(lockinfo_idr);
+static rwlock_t lockinfo_lock;
+
+/* Flags in li_flags */
+#define LI_FLAG_COMPLETE   1
+#define LI_FLAG_FIRSTLOCK  2
+#define LI_FLAG_PERSISTENT 3
+
+/* flags in ls_flags*/
+#define LS_FLAG_DELETED   1
+#define LS_FLAG_AUTOFREE  2
+
+
+#define LOCKINFO_MAGIC 0x53595324
+
+struct lock_info {
+       uint32_t li_magic;
+       uint8_t li_cmd;
+       int8_t  li_grmode;
+       int8_t  li_rqmode;
+       struct dlm_lksb li_lksb;
+       wait_queue_head_t li_waitq;
+       unsigned long li_flags;
+       void __user *li_castparam;
+       void __user *li_castaddr;
+       void __user *li_bastparam;
+       void __user *li_bastaddr;
+       void __user *li_pend_bastparam;
+       void __user *li_pend_bastaddr;
+       struct list_head li_ownerqueue;
+       struct file_info *li_file;
+       struct dlm_lksb __user *li_user_lksb;
+       struct semaphore li_firstlock;
+};
+
+/* A queued AST no less */
+struct ast_info {
+       struct dlm_lock_result result;
+       struct list_head list;
+       uint32_t lvb_updated;
+       uint32_t progress;      /* How much has been read */
+};
+
+/* One of these per userland lockspace */
+struct user_ls {
+       void    *ls_lockspace;
+       atomic_t ls_refcnt;
+       long     ls_flags;
+
+       /* Passed into misc_register() */
+       struct miscdevice ls_miscinfo;
+       struct list_head  ls_list;
+};
+
+/* misc_device info for the control device */
+static struct miscdevice ctl_device;
+
+/*
+ * Stuff we hang off the file struct.
+ * The first two are to cope with unlocking all the
+ * locks help by a process when it dies.
+ */
+struct file_info {
+       struct list_head    fi_li_list;  /* List of active lock_infos */
+       spinlock_t          fi_li_lock;
+       struct list_head    fi_ast_list; /* Queue of ASTs to be delivered */
+       spinlock_t          fi_ast_lock;
+       wait_queue_head_t   fi_wait;
+       struct user_ls     *fi_ls;
+       atomic_t            fi_refcnt;   /* Number of users */
+       unsigned long       fi_flags;    /* Bit 1 means the device is open */
+};
+
+
+/* get and put ops for file_info.
+   Actually I don't really like "get" and "put", but everyone
+   else seems to use them and I can't think of anything
+   nicer at the moment */
+static void get_file_info(struct file_info *f)
+{
+       atomic_inc(&f->fi_refcnt);
+}
+
+static void put_file_info(struct file_info *f)
+{
+       if (atomic_dec_and_test(&f->fi_refcnt))
+               kfree(f);
+}
+
+static void release_lockinfo(struct lock_info *li)
+{
+       put_file_info(li->li_file);
+
+       write_lock(&lockinfo_lock);
+       idr_remove(&lockinfo_idr, li->li_lksb.sb_lkid);
+       write_unlock(&lockinfo_lock);
+
+       if (li->li_lksb.sb_lvbptr)
+               kfree(li->li_lksb.sb_lvbptr);
+       kfree(li);
+
+       module_put(THIS_MODULE);
+}
+
+static struct lock_info *get_lockinfo(uint32_t lockid)
+{
+       struct lock_info *li;
+
+       read_lock(&lockinfo_lock);
+       li = idr_find(&lockinfo_idr, lockid);
+       read_unlock(&lockinfo_lock);
+
+       return li;
+}
+
+static int add_lockinfo(struct lock_info *li)
+{
+       int n;
+       int r;
+       int ret = -EINVAL;
+
+       write_lock(&lockinfo_lock);
+
+       if (idr_find(&lockinfo_idr, li->li_lksb.sb_lkid))
+               goto out_up;
+
+       ret = -ENOMEM;
+       r = idr_pre_get(&lockinfo_idr, GFP_KERNEL);
+       if (!r)
+               goto out_up;
+
+       r = idr_get_new_above(&lockinfo_idr, li, li->li_lksb.sb_lkid, &n);
+       if (r)
+               goto out_up;
+
+       if (n != li->li_lksb.sb_lkid) {
+               idr_remove(&lockinfo_idr, n);
+               goto out_up;
+       }
+
+       ret = 0;
+
+ out_up:
+       write_unlock(&lockinfo_lock);
+
+       return ret;
+}
+
+
+static struct user_ls *__find_lockspace(int minor)
+{
+       struct user_ls *lsinfo;
+
+       list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
+               if (lsinfo->ls_miscinfo.minor == minor)
+                       return lsinfo;
+       }
+       return NULL;
+}
+
+/* Find a lockspace struct given the device minor number */
+static struct user_ls *find_lockspace(int minor)
+{
+       struct user_ls *lsinfo;
+
+       down(&user_ls_lock);
+       lsinfo = __find_lockspace(minor);
+       up(&user_ls_lock);
+
+       return lsinfo;
+}
+
+static void add_lockspace_to_list(struct user_ls *lsinfo)
+{
+       down(&user_ls_lock);
+       list_add(&lsinfo->ls_list, &user_ls_list);
+       up(&user_ls_lock);
+}
+
+/* Register a lockspace with the DLM and create a misc
+   device for userland to access it */
+static int register_lockspace(char *name, struct user_ls **ls, int flags)
+{
+       struct user_ls *newls;
+       int status;
+       int namelen;
+
+       namelen = strlen(name)+strlen(name_prefix)+2;
+
+       newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
+       if (!newls)
+               return -ENOMEM;
+       memset(newls, 0, sizeof(struct user_ls));
+
+       newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
+       if (!newls->ls_miscinfo.name) {
+               kfree(newls);
+               return -ENOMEM;
+       }
+
+       status = dlm_new_lockspace(name, strlen(name), &newls->ls_lockspace, 0,
+                                  DLM_USER_LVB_LEN);
+       if (status != 0) {
+               kfree(newls->ls_miscinfo.name);
+               kfree(newls);
+               return status;
+       }
+
+       snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s",
+                name_prefix, name);
+
+       newls->ls_miscinfo.fops = &_dlm_fops;
+       newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
+
+       status = misc_register(&newls->ls_miscinfo);
+       if (status) {
+               printk(KERN_ERR "dlm: misc register failed for %s\n", name);
+               dlm_release_lockspace(newls->ls_lockspace, 0);
+               kfree(newls->ls_miscinfo.name);
+               kfree(newls);
+               return status;
+       }
+
+       if (flags & DLM_USER_LSFLG_AUTOFREE)
+               set_bit(LS_FLAG_AUTOFREE, &newls->ls_flags);
+
+       add_lockspace_to_list(newls);
+       *ls = newls;
+       return 0;
+}
+
+/* Called with the user_ls_lock semaphore held */
+static int unregister_lockspace(struct user_ls *lsinfo, int force)
+{
+       int status;
+
+       status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
+       if (status)
+               return status;
+
+       status = misc_deregister(&lsinfo->ls_miscinfo);
+       if (status)
+               return status;
+
+       list_del(&lsinfo->ls_list);
+       set_bit(LS_FLAG_DELETED, &lsinfo->ls_flags);
+       lsinfo->ls_lockspace = NULL;
+       if (atomic_read(&lsinfo->ls_refcnt) == 0) {
+               kfree(lsinfo->ls_miscinfo.name);
+               kfree(lsinfo);
+       }
+
+       return 0;
+}
+
+/* Add it to userland's AST queue */
+static void add_to_astqueue(struct lock_info *li, void *astaddr, void *astparam,
+                           int lvb_updated)
+{
+       struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
+       if (!ast)
+               return;
+
+       memset(ast, 0, sizeof(*ast));
+       ast->result.user_astparam = astparam;
+       ast->result.user_astaddr  = astaddr;
+       ast->result.user_lksb     = li->li_user_lksb;
+       memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
+       ast->lvb_updated = lvb_updated;
+
+       spin_lock(&li->li_file->fi_ast_lock);
+       list_add_tail(&ast->list, &li->li_file->fi_ast_list);
+       spin_unlock(&li->li_file->fi_ast_lock);
+       wake_up_interruptible(&li->li_file->fi_wait);
+}
+
+static void bast_routine(void *param, int mode)
+{
+       struct lock_info *li = param;
+
+       if (li && li->li_bastaddr)
+               add_to_astqueue(li, li->li_bastaddr, li->li_bastparam, 0);
+}
+
+/*
+ * This is the kernel's AST routine.
+ * All lock, unlock & query operations complete here.
+ * The only syncronous ops are those done during device close.
+ */
+static void ast_routine(void *param)
+{
+       struct lock_info *li = param;
+
+       /* Param may be NULL if a persistent lock is unlocked by someone else */
+       if (!li)
+               return;
+
+       /* If this is a succesful conversion then activate the blocking ast
+        * args from the conversion request */
+       if (!test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
+           li->li_lksb.sb_status == 0) {
+
+               li->li_bastparam = li->li_pend_bastparam;
+               li->li_bastaddr = li->li_pend_bastaddr;
+               li->li_pend_bastaddr = NULL;
+       }
+
+       /* If it's an async request then post data to the user's AST queue. */
+       if (li->li_castaddr) {
+               int lvb_updated = 0;
+
+               /* See if the lvb has been updated */
+               if (dlm_lvb_operations[li->li_grmode+1][li->li_rqmode+1] == 1)
+                       lvb_updated = 1;
+
+               if (li->li_lksb.sb_status == 0)
+                       li->li_grmode = li->li_rqmode;
+
+               /* Only queue AST if the device is still open */
+               if (test_bit(1, &li->li_file->fi_flags))
+                       add_to_astqueue(li, li->li_castaddr, li->li_castparam,
+                                       lvb_updated);
+
+               /* If it's a new lock operation that failed, then
+                * remove it from the owner queue and free the
+                * lock_info.
+                */
+               if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
+                   li->li_lksb.sb_status != 0) {
+
+                       /* Wait till dlm_lock() has finished */
+                       down(&li->li_firstlock);
+                       up(&li->li_firstlock);
+
+                       spin_lock(&li->li_file->fi_li_lock);
+                       list_del(&li->li_ownerqueue);
+                       spin_unlock(&li->li_file->fi_li_lock);
+                       release_lockinfo(li);
+                       return;
+               }
+               /* Free unlocks & queries */
+               if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
+                   li->li_cmd == DLM_USER_QUERY) {
+                       release_lockinfo(li);
+               }
+       } else {
+               /* Synchronous request, just wake up the caller */
+               set_bit(LI_FLAG_COMPLETE, &li->li_flags);
+               wake_up_interruptible(&li->li_waitq);
+       }
+}
+
+/*
+ * Wait for the lock op to complete and return the status.
+ */
+static int wait_for_ast(struct lock_info *li)
+{
+       /* Wait for the AST routine to complete */
+       set_task_state(current, TASK_INTERRUPTIBLE);
+       while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
+               schedule();
+
+       set_task_state(current, TASK_RUNNING);
+
+       return li->li_lksb.sb_status;
+}
+
+
+/* Open on control device */
+static int dlm_ctl_open(struct inode *inode, struct file *file)
+{
+       file->private_data = NULL;
+       return 0;
+}
+
+/* Close on control device */
+static int dlm_ctl_close(struct inode *inode, struct file *file)
+{
+       return 0;
+}
+
+/* Open on lockspace device */
+static int dlm_open(struct inode *inode, struct file *file)
+{
+       struct file_info *f;
+       struct user_ls *lsinfo;
+
+       lsinfo = find_lockspace(iminor(inode));
+       if (!lsinfo)
+               return -ENOENT;
+
+       f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
+       if (!f)
+               return -ENOMEM;
+
+       atomic_inc(&lsinfo->ls_refcnt);
+       INIT_LIST_HEAD(&f->fi_li_list);
+       INIT_LIST_HEAD(&f->fi_ast_list);
+       spin_lock_init(&f->fi_li_lock);
+       spin_lock_init(&f->fi_ast_lock);
+       init_waitqueue_head(&f->fi_wait);
+       f->fi_ls = lsinfo;
+       f->fi_flags = 0;
+       get_file_info(f);
+       set_bit(1, &f->fi_flags);
+
+       file->private_data = f;
+
+       return 0;
+}
+
+/* Check the user's version matches ours */
+static int check_version(struct dlm_write_request *req)
+{
+       if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
+           (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
+            req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
+
+               printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
+                      "user (%d.%d.%d) kernel (%d.%d.%d)\n",
+                      current->comm,
+                      current->pid,
+                      req->version[0],
+                      req->version[1],
+                      req->version[2],
+                      DLM_DEVICE_VERSION_MAJOR,
+                      DLM_DEVICE_VERSION_MINOR,
+                      DLM_DEVICE_VERSION_PATCH);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+/* Close on lockspace device */
+static int dlm_close(struct inode *inode, struct file *file)
+{
+       struct file_info *f = file->private_data;
+       struct lock_info li;
+       struct lock_info *old_li, *safe;
+       sigset_t tmpsig;
+       sigset_t allsigs;
+       struct user_ls *lsinfo;
+       DECLARE_WAITQUEUE(wq, current);
+
+       lsinfo = find_lockspace(iminor(inode));
+       if (!lsinfo)
+               return -ENOENT;
+
+       /* Mark this closed so that ASTs will not be delivered any more */
+       clear_bit(1, &f->fi_flags);
+
+       /* Block signals while we are doing this */
+       sigfillset(&allsigs);
+       sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
+
+       /* We use our own lock_info struct here, so that any
+        * outstanding "real" ASTs will be delivered with the
+        * corresponding "real" params, thus freeing the lock_info
+        * that belongs the lock. This catches the corner case where
+        * a lock is BUSY when we try to unlock it here
+        */
+       memset(&li, 0, sizeof(li));
+       clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
+       init_waitqueue_head(&li.li_waitq);
+       add_wait_queue(&li.li_waitq, &wq);
+
+       /*
+        * Free any outstanding locks, they are on the
+        * list in LIFO order so there should be no problems
+        * about unlocking parents before children.
+        */
+       list_for_each_entry_safe(old_li, safe, &f->fi_li_list, li_ownerqueue) {
+               int status;
+               int flags = 0;
+
+               /* Don't unlock persistent locks, just mark them orphaned */
+               if (test_bit(LI_FLAG_PERSISTENT, &old_li->li_flags)) {
+                       list_del(&old_li->li_ownerqueue);
+
+                       /* Update master copy */
+                       /* TODO: Check locking core updates the local and
+                          remote ORPHAN flags */
+                       li.li_lksb.sb_lkid = old_li->li_lksb.sb_lkid;
+                       status = dlm_lock(f->fi_ls->ls_lockspace,
+                                         old_li->li_grmode, &li.li_lksb,
+                                         DLM_LKF_CONVERT|DLM_LKF_ORPHAN,
+                                         NULL, 0, 0, ast_routine, NULL,
+                                         NULL, NULL);
+                       if (status != 0)
+                               printk("dlm: Error orphaning lock %x: %d\n",
+                                      old_li->li_lksb.sb_lkid, status);
+
+                       /* But tidy our references in it */
+                       release_lockinfo(old_li);
+                       continue;
+               }
+
+               clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
+
+               flags = DLM_LKF_FORCEUNLOCK;
+               if (old_li->li_grmode >= DLM_LOCK_PW)
+                       flags |= DLM_LKF_IVVALBLK;
+
+               status = dlm_unlock(f->fi_ls->ls_lockspace,
+                                   old_li->li_lksb.sb_lkid, flags,
+                                   &li.li_lksb, &li);
+
+               /* Must wait for it to complete as the next lock could be its
+                * parent */
+               if (status == 0)
+                       wait_for_ast(&li);
+
+               /* Unlock suceeded, free the lock_info struct. */
+               if (status == 0)
+                       release_lockinfo(old_li);
+       }
+
+       remove_wait_queue(&li.li_waitq, &wq);
+
+       /*
+        * If this is the last reference to the lockspace
+        * then free the struct. If it's an AUTOFREE lockspace
+        * then free the whole thing.
+        */
+       down(&user_ls_lock);
+       if (atomic_dec_and_test(&lsinfo->ls_refcnt)) {
+
+               if (lsinfo->ls_lockspace) {
+                       if (test_bit(LS_FLAG_AUTOFREE, &lsinfo->ls_flags)) {
+                               unregister_lockspace(lsinfo, 1);
+                       }
+               } else {
+                       kfree(lsinfo->ls_miscinfo.name);
+                       kfree(lsinfo);
+               }
+       }
+       up(&user_ls_lock);
+       put_file_info(f);
+
+       /* Restore signals */
+       sigprocmask(SIG_SETMASK, &tmpsig, NULL);
+       recalc_sigpending();
+
+       return 0;
+}
+
+static int do_user_create_lockspace(struct file_info *fi, uint8_t cmd,
+                                   struct dlm_lspace_params *kparams)
+{
+       int status;
+       struct user_ls *lsinfo;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       status = register_lockspace(kparams->name, &lsinfo, kparams->flags);
+
+       /* If it succeeded then return the minor number */
+       if (status == 0)
+               status = lsinfo->ls_miscinfo.minor;
+
+       return status;
+}
+
+static int do_user_remove_lockspace(struct file_info *fi, uint8_t cmd,
+                                   struct dlm_lspace_params *kparams)
+{
+       int status;
+       int force = 1;
+       struct user_ls *lsinfo;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       down(&user_ls_lock);
+       lsinfo = __find_lockspace(kparams->minor);
+       if (!lsinfo) {
+               up(&user_ls_lock);
+               return -EINVAL;
+       }
+
+       if (kparams->flags & DLM_USER_LSFLG_FORCEFREE)
+               force = 2;
+
+       status = unregister_lockspace(lsinfo, force);
+       up(&user_ls_lock);
+
+       return status;
+}
+
+/* Read call, might block if no ASTs are waiting.
+ * It will only ever return one message at a time, regardless
+ * of how many are pending.
+ */
+static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count,
+                       loff_t *ppos)
+{
+       struct file_info *fi = file->private_data;
+       struct ast_info *ast;
+       int data_size;
+       int offset;
+       DECLARE_WAITQUEUE(wait, current);
+
+       if (count < sizeof(struct dlm_lock_result))
+               return -EINVAL;
+
+       spin_lock(&fi->fi_ast_lock);
+       if (list_empty(&fi->fi_ast_list)) {
+
+               /* No waiting ASTs.
+                * Return EOF if the lockspace been deleted.
+                */
+               if (test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags))
+                       return 0;
+
+               if (file->f_flags & O_NONBLOCK) {
+                       spin_unlock(&fi->fi_ast_lock);
+                       return -EAGAIN;
+               }
+
+               add_wait_queue(&fi->fi_wait, &wait);
+
+       repeat:
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (list_empty(&fi->fi_ast_list) &&
+                   !signal_pending(current)) {
+
+                       spin_unlock(&fi->fi_ast_lock);
+                       schedule();
+                       spin_lock(&fi->fi_ast_lock);
+                       goto repeat;
+               }
+
+               current->state = TASK_RUNNING;
+               remove_wait_queue(&fi->fi_wait, &wait);
+
+               if (signal_pending(current)) {
+                       spin_unlock(&fi->fi_ast_lock);
+                       return -ERESTARTSYS;
+               }
+       }
+
+       ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
+       list_del(&ast->list);
+       spin_unlock(&fi->fi_ast_lock);
+
+       /* Work out the size of the returned data */
+       data_size = sizeof(struct dlm_lock_result);
+       if (ast->lvb_updated && ast->result.lksb.sb_lvbptr)
+               data_size += DLM_USER_LVB_LEN;
+
+       offset = sizeof(struct dlm_lock_result);
+
+       /* Room for the extended data ? */
+       if (count >= data_size) {
+
+               if (ast->lvb_updated && ast->result.lksb.sb_lvbptr) {
+                       if (copy_to_user(buffer+offset,
+                                        ast->result.lksb.sb_lvbptr,
+                                        DLM_USER_LVB_LEN))
+                               return -EFAULT;
+                       ast->result.lvb_offset = offset;
+                       offset += DLM_USER_LVB_LEN;
+               }
+       }
+
+       ast->result.length = data_size;
+       /* Copy the header now it has all the offsets in it */
+       if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
+               offset = -EFAULT;
+
+       /* If we only returned a header and there's more to come then put it
+          back on the list */
+       if (count < data_size) {
+               spin_lock(&fi->fi_ast_lock);
+               list_add(&ast->list, &fi->fi_ast_list);
+               spin_unlock(&fi->fi_ast_lock);
+       } else
+               kfree(ast);
+       return offset;
+}
+
+static unsigned int dlm_poll(struct file *file, poll_table *wait)
+{
+       struct file_info *fi = file->private_data;
+
+       poll_wait(file, &fi->fi_wait, wait);
+
+       spin_lock(&fi->fi_ast_lock);
+       if (!list_empty(&fi->fi_ast_list)) {
+               spin_unlock(&fi->fi_ast_lock);
+               return POLLIN | POLLRDNORM;
+       }
+
+       spin_unlock(&fi->fi_ast_lock);
+       return 0;
+}
+
+static struct lock_info *allocate_lockinfo(struct file_info *fi, uint8_t cmd,
+                                          struct dlm_lock_params *kparams)
+{
+       struct lock_info *li;
+
+       if (!try_module_get(THIS_MODULE))
+               return NULL;
+
+       li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
+       if (li) {
+               li->li_magic     = LOCKINFO_MAGIC;
+               li->li_file      = fi;
+               li->li_cmd       = cmd;
+               li->li_flags     = 0;
+               li->li_grmode    = -1;
+               li->li_rqmode    = -1;
+               li->li_pend_bastparam = NULL;
+               li->li_pend_bastaddr  = NULL;
+               li->li_castaddr   = NULL;
+               li->li_castparam  = NULL;
+               li->li_lksb.sb_lvbptr = NULL;
+               li->li_bastaddr  = kparams->bastaddr;
+               li->li_bastparam = kparams->bastparam;
+
+               get_file_info(fi);
+       }
+       return li;
+}
+
+static int do_user_lock(struct file_info *fi, uint8_t cmd,
+                       struct dlm_lock_params *kparams)
+{
+       struct lock_info *li;
+       int status;
+
+       /*
+        * Validate things that we need to have correct.
+        */
+       if (!kparams->castaddr)
+               return -EINVAL;
+
+       if (!kparams->lksb)
+               return -EINVAL;
+
+       /* Persistent child locks are not available yet */
+       if ((kparams->flags & DLM_LKF_PERSISTENT) && kparams->parent)
+               return -EINVAL;
+
+        /* For conversions, there should already be a lockinfo struct,
+          unless we are adopting an orphaned persistent lock */
+       if (kparams->flags & DLM_LKF_CONVERT) {
+
+               li = get_lockinfo(kparams->lkid);
+
+               /* If this is a persistent lock we will have to create a
+                  lockinfo again */
+               if (!li && DLM_LKF_PERSISTENT) {
+                       li = allocate_lockinfo(fi, cmd, kparams);
+
+                       li->li_lksb.sb_lkid = kparams->lkid;
+                       li->li_castaddr  = kparams->castaddr;
+                       li->li_castparam = kparams->castparam;
+
+                       /* OK, this isn;t exactly a FIRSTLOCK but it is the
+                          first time we've used this lockinfo, and if things
+                          fail we want rid of it */
+                       init_MUTEX_LOCKED(&li->li_firstlock);
+                       set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
+                       add_lockinfo(li);
+
+                       /* TODO: do a query to get the current state ?? */
+               }
+               if (!li)
+                       return -EINVAL;
+
+               if (li->li_magic != LOCKINFO_MAGIC)
+                       return -EINVAL;
+
+               /* For conversions don't overwrite the current blocking AST
+                  info so that:
+                  a) if a blocking AST fires before the conversion is queued
+                     it runs the current handler
+                  b) if the conversion is cancelled, the original blocking AST
+                     declaration is active
+                  The pend_ info is made active when the conversion
+                  completes.
+               */
+               li->li_pend_bastaddr  = kparams->bastaddr;
+               li->li_pend_bastparam = kparams->bastparam;
+       } else {
+               li = allocate_lockinfo(fi, cmd, kparams);
+               if (!li)
+                       return -ENOMEM;
+
+               /* semaphore to allow us to complete our work before
+                  the AST routine runs. In fact we only need (and use) this
+                  when the initial lock fails */
+               init_MUTEX_LOCKED(&li->li_firstlock);
+               set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
+       }
+
+       li->li_user_lksb = kparams->lksb;
+       li->li_castaddr  = kparams->castaddr;
+       li->li_castparam = kparams->castparam;
+       li->li_lksb.sb_lkid = kparams->lkid;
+       li->li_rqmode    = kparams->mode;
+       if (kparams->flags & DLM_LKF_PERSISTENT)
+               set_bit(LI_FLAG_PERSISTENT, &li->li_flags);
+
+       /* Copy in the value block */
+       if (kparams->flags & DLM_LKF_VALBLK) {
+               if (!li->li_lksb.sb_lvbptr) {
+                       li->li_lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN,
+                                                       GFP_KERNEL);
+                       if (!li->li_lksb.sb_lvbptr) {
+                               status = -ENOMEM;
+                               goto out_err;
+                       }
+               }
+
+               memcpy(li->li_lksb.sb_lvbptr, kparams->lvb, DLM_USER_LVB_LEN);
+       }
+
+       /* Lock it ... */
+       status = dlm_lock(fi->fi_ls->ls_lockspace,
+                         kparams->mode, &li->li_lksb,
+                         kparams->flags,
+                         kparams->name, kparams->namelen,
+                         kparams->parent,
+                         ast_routine,
+                         li,
+                         (li->li_pend_bastaddr || li->li_bastaddr) ?
+                          bast_routine : NULL,
+                         kparams->range.ra_end ? &kparams->range : NULL);
+       if (status)
+               goto out_err;
+
+       /* If it succeeded (this far) with a new lock then keep track of
+          it on the file's lockinfo list */
+       if (!status && test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags)) {
+
+               spin_lock(&fi->fi_li_lock);
+               list_add(&li->li_ownerqueue, &fi->fi_li_list);
+               spin_unlock(&fi->fi_li_lock);
+               if (add_lockinfo(li))
+                       printk(KERN_WARNING "Add lockinfo failed\n");
+
+               up(&li->li_firstlock);
+       }
+
+       /* Return the lockid as the user needs it /now/ */
+       return li->li_lksb.sb_lkid;
+
+ out_err:
+       if (test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags))
+               release_lockinfo(li);
+       return status;
+
+}
+
+static int do_user_unlock(struct file_info *fi, uint8_t cmd,
+                         struct dlm_lock_params *kparams)
+{
+       struct lock_info *li;
+       int status;
+       int convert_cancel = 0;
+
+       li = get_lockinfo(kparams->lkid);
+       if (!li) {
+               li = allocate_lockinfo(fi, cmd, kparams);
+               spin_lock(&fi->fi_li_lock);
+               list_add(&li->li_ownerqueue, &fi->fi_li_list);
+               spin_unlock(&fi->fi_li_lock);
+       }
+       if (!li)
+               return -ENOMEM;
+
+       if (li->li_magic != LOCKINFO_MAGIC)
+               return -EINVAL;
+
+       li->li_user_lksb = kparams->lksb;
+       li->li_castparam = kparams->castparam;
+       li->li_cmd       = cmd;
+
+       /* Cancelling a conversion doesn't remove the lock...*/
+       if (kparams->flags & DLM_LKF_CANCEL && li->li_grmode != -1)
+               convert_cancel = 1;
+
+       /* dlm_unlock() passes a 0 for castaddr which means don't overwrite
+          the existing li_castaddr as that's the completion routine for
+          unlocks. dlm_unlock_wait() specifies a new AST routine to be
+          executed when the unlock completes. */
+       if (kparams->castaddr)
+               li->li_castaddr = kparams->castaddr;
+
+       /* Use existing lksb & astparams */
+       status = dlm_unlock(fi->fi_ls->ls_lockspace,
+                            kparams->lkid,
+                            kparams->flags, &li->li_lksb, li);
+
+       if (!status && !convert_cancel) {
+               spin_lock(&fi->fi_li_lock);
+               list_del(&li->li_ownerqueue);
+               spin_unlock(&fi->fi_li_lock);
+       }
+
+       return status;
+}
+
+/* Write call, submit a locking request */
+static ssize_t dlm_write(struct file *file, const char __user *buffer,
+                        size_t count, loff_t *ppos)
+{
+       struct file_info *fi = file->private_data;
+       struct dlm_write_request *kparams;
+       sigset_t tmpsig;
+       sigset_t allsigs;
+       int status;
+
+       /* -1 because lock name is optional */
+       if (count < sizeof(struct dlm_write_request)-1)
+               return -EINVAL;
+
+       /* Has the lockspace been deleted */
+       if (fi && test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags))
+               return -ENOENT;
+
+       kparams = kmalloc(count, GFP_KERNEL);
+       if (!kparams)
+               return -ENOMEM;
+
+       status = -EFAULT;
+       /* Get the command info */
+       if (copy_from_user(kparams, buffer, count))
+               goto out_free;
+
+       status = -EBADE;
+       if (check_version(kparams))
+               goto out_free;
+
+       /* Block signals while we are doing this */
+       sigfillset(&allsigs);
+       sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
+
+       status = -EINVAL;
+       switch (kparams->cmd)
+       {
+       case DLM_USER_LOCK:
+               if (!fi) goto out_sig;
+               status = do_user_lock(fi, kparams->cmd, &kparams->i.lock);
+               break;
+
+       case DLM_USER_UNLOCK:
+               if (!fi) goto out_sig;
+               status = do_user_unlock(fi, kparams->cmd, &kparams->i.lock);
+               break;
+
+       case DLM_USER_CREATE_LOCKSPACE:
+               if (fi) goto out_sig;
+               status = do_user_create_lockspace(fi, kparams->cmd,
+                                                 &kparams->i.lspace);
+               break;
+
+       case DLM_USER_REMOVE_LOCKSPACE:
+               if (fi) goto out_sig;
+               status = do_user_remove_lockspace(fi, kparams->cmd,
+                                                 &kparams->i.lspace);
+               break;
+       default:
+               printk("Unknown command passed to DLM device : %d\n",
+                       kparams->cmd);
+               break;
+       }
+
+ out_sig:
+       /* Restore signals */
+       sigprocmask(SIG_SETMASK, &tmpsig, NULL);
+       recalc_sigpending();
+
+ out_free:
+       kfree(kparams);
+       if (status == 0)
+               return count;
+       else
+               return status;
+}
+
+static struct file_operations _dlm_fops = {
+      .open    = dlm_open,
+      .release = dlm_close,
+      .read    = dlm_read,
+      .write   = dlm_write,
+      .poll    = dlm_poll,
+      .owner   = THIS_MODULE,
+};
+
+static struct file_operations _dlm_ctl_fops = {
+      .open    = dlm_ctl_open,
+      .release = dlm_ctl_close,
+      .write   = dlm_write,
+      .owner   = THIS_MODULE,
+};
+
+/*
+ * Create control device
+ */
+static int __init dlm_device_init(void)
+{
+       int r;
+
+       INIT_LIST_HEAD(&user_ls_list);
+       init_MUTEX(&user_ls_lock);
+       rwlock_init(&lockinfo_lock);
+
+       ctl_device.name = "dlm-control";
+       ctl_device.fops = &_dlm_ctl_fops;
+       ctl_device.minor = MISC_DYNAMIC_MINOR;
+
+       r = misc_register(&ctl_device);
+       if (r) {
+               printk(KERN_ERR "dlm: misc_register failed for control dev\n");
+               return r;
+       }
+
+       return 0;
+}
+
+static void __exit dlm_device_exit(void)
+{
+       misc_deregister(&ctl_device);
+}
+
+MODULE_DESCRIPTION("Distributed Lock Manager device interface");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+module_init(dlm_device_init);
+module_exit(dlm_device_exit);
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644 (file)
index 0000000..0f1dde5
--- /dev/null
@@ -0,0 +1,423 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "rcom.h"
+#include "config.h"
+#include "memory.h"
+#include "recover.h"
+#include "util.h"
+#include "lock.h"
+#include "dir.h"
+
+
+static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
+{
+       spin_lock(&ls->ls_recover_list_lock);
+       list_add(&de->list, &ls->ls_recover_list);
+       spin_unlock(&ls->ls_recover_list_lock);
+}
+
+static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
+{
+       int found = FALSE;
+       struct dlm_direntry *de;
+
+       spin_lock(&ls->ls_recover_list_lock);
+       list_for_each_entry(de, &ls->ls_recover_list, list) {
+               if (de->length == len) {
+                       list_del(&de->list);
+                       de->master_nodeid = 0;
+                       memset(de->name, 0, len);
+                       found = TRUE;
+                       break;
+               }
+       }
+       spin_unlock(&ls->ls_recover_list_lock);
+
+       if (!found)
+               de = allocate_direntry(ls, len);
+       return de;
+}
+
+void dlm_clear_free_entries(struct dlm_ls *ls)
+{
+       struct dlm_direntry *de;
+
+       spin_lock(&ls->ls_recover_list_lock);
+       while (!list_empty(&ls->ls_recover_list)) {
+               de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
+                               list);
+               list_del(&de->list);
+               free_direntry(de);
+       }
+       spin_unlock(&ls->ls_recover_list_lock);
+}
+
+/*
+ * We use the upper 16 bits of the hash value to select the directory node.
+ * Low bits are used for distribution of rsb's among hash buckets on each node.
+ *
+ * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
+ * num_nodes to the hash value.  This value in the desired range is used as an
+ * offset into the sorted list of nodeid's to give the particular nodeid.
+ */
+
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
+{
+       struct list_head *tmp;
+       struct dlm_member *memb = NULL;
+       uint32_t node, n = 0;
+       int nodeid;
+
+       if (ls->ls_num_nodes == 1) {
+               nodeid = dlm_our_nodeid();
+               goto out;
+       }
+
+       if (ls->ls_node_array) {
+               node = (hash >> 16) % ls->ls_total_weight;
+               nodeid = ls->ls_node_array[node];
+               goto out;
+       }
+
+       /* make_member_array() failed to kmalloc ls_node_array... */
+
+       node = (hash >> 16) % ls->ls_num_nodes;
+
+       list_for_each(tmp, &ls->ls_nodes) {
+               if (n++ != node)
+                       continue;
+               memb = list_entry(tmp, struct dlm_member, list);
+               break;
+       }
+
+       DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n",
+                                ls->ls_num_nodes, n, node););
+       nodeid = memb->nodeid;
+ out:
+       return nodeid;
+}
+
+int dlm_dir_nodeid(struct dlm_rsb *r)
+{
+       return dlm_hash2nodeid(r->res_ls, r->res_hash);
+}
+
+static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
+{
+       uint32_t val;
+
+       val = jhash(name, len, 0);
+       val &= (ls->ls_dirtbl_size - 1);
+
+       return val;
+}
+
+static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
+{
+       uint32_t bucket;
+
+       bucket = dir_hash(ls, de->name, de->length);
+       list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
+}
+
+static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
+                                         int namelen, uint32_t bucket)
+{
+       struct dlm_direntry *de;
+
+       list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
+               if (de->length == namelen && !memcmp(name, de->name, namelen))
+                       goto out;
+       }
+       de = NULL;
+ out:
+       return de;
+}
+
+void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
+{
+       struct dlm_direntry *de;
+       uint32_t bucket;
+
+       bucket = dir_hash(ls, name, namelen);
+
+       write_lock(&ls->ls_dirtbl[bucket].lock);
+
+       de = search_bucket(ls, name, namelen, bucket);
+
+       if (!de) {
+               log_error(ls, "remove fr %u none", nodeid);
+               goto out;
+       }
+
+       if (de->master_nodeid != nodeid) {
+               log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
+               goto out;
+       }
+
+       list_del(&de->list);
+       free_direntry(de);
+ out:
+       write_unlock(&ls->ls_dirtbl[bucket].lock);
+}
+
+void dlm_dir_clear(struct dlm_ls *ls)
+{
+       struct list_head *head;
+       struct dlm_direntry *de;
+       int i;
+
+       DLM_ASSERT(list_empty(&ls->ls_recover_list), );
+
+       for (i = 0; i < ls->ls_dirtbl_size; i++) {
+               write_lock(&ls->ls_dirtbl[i].lock);
+               head = &ls->ls_dirtbl[i].list;
+               while (!list_empty(head)) {
+                       de = list_entry(head->next, struct dlm_direntry, list);
+                       list_del(&de->list);
+                       put_free_de(ls, de);
+               }
+               write_unlock(&ls->ls_dirtbl[i].lock);
+       }
+}
+
+int dlm_recover_directory(struct dlm_ls *ls)
+{
+       struct dlm_member *memb;
+       struct dlm_direntry *de;
+       char *b, *last_name = NULL;
+       int error = -ENOMEM, last_len, count = 0;
+       uint16_t namelen;
+
+       log_debug(ls, "dlm_recover_directory");
+
+       if (dlm_no_directory(ls))
+               goto out_status;
+
+       dlm_dir_clear(ls);
+
+       last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
+       if (!last_name)
+               goto out;
+
+       list_for_each_entry(memb, &ls->ls_nodes, list) {
+               memset(last_name, 0, DLM_RESNAME_MAXLEN);
+               last_len = 0;
+
+               for (;;) {
+                       error = dlm_recovery_stopped(ls);
+                       if (error)
+                               goto out_free;
+
+                       error = dlm_rcom_names(ls, memb->nodeid,
+                                              last_name, last_len);
+                       if (error)
+                               goto out_free;
+
+                       schedule();
+
+                       /*
+                        * pick namelen/name pairs out of received buffer
+                        */
+
+                       b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
+
+                       for (;;) {
+                               memcpy(&namelen, b, sizeof(uint16_t));
+                               namelen = be16_to_cpu(namelen);
+                               b += sizeof(uint16_t);
+
+                               /* namelen of 0xFFFFF marks end of names for
+                                  this node; namelen of 0 marks end of the
+                                  buffer */
+
+                               if (namelen == 0xFFFF)
+                                       goto done;
+                               if (!namelen)
+                                       break;
+
+                               error = -ENOMEM;
+                               de = get_free_de(ls, namelen);
+                               if (!de)
+                                       goto out_free;
+
+                               de->master_nodeid = memb->nodeid;
+                               de->length = namelen;
+                               last_len = namelen;
+                               memcpy(de->name, b, namelen);
+                               memcpy(last_name, b, namelen);
+                               b += namelen;
+
+                               add_entry_to_hash(ls, de);
+                               count++;
+                       }
+               }
+         done:
+               ;
+       }
+
+ out_status:
+       error = 0;
+       dlm_set_recover_status(ls, DLM_RS_DIR);
+       log_debug(ls, "dlm_recover_directory %d entries", count);
+ out_free:
+       kfree(last_name);
+ out:
+       dlm_clear_free_entries(ls);
+       return error;
+}
+
+static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
+                    int namelen, int *r_nodeid)
+{
+       struct dlm_direntry *de, *tmp;
+       uint32_t bucket;
+
+       bucket = dir_hash(ls, name, namelen);
+
+       write_lock(&ls->ls_dirtbl[bucket].lock);
+       de = search_bucket(ls, name, namelen, bucket);
+       if (de) {
+               *r_nodeid = de->master_nodeid;
+               write_unlock(&ls->ls_dirtbl[bucket].lock);
+               if (*r_nodeid == nodeid)
+                       return -EEXIST;
+               return 0;
+       }
+
+       write_unlock(&ls->ls_dirtbl[bucket].lock);
+
+       de = allocate_direntry(ls, namelen);
+       if (!de)
+               return -ENOMEM;
+
+       de->master_nodeid = nodeid;
+       de->length = namelen;
+       memcpy(de->name, name, namelen);
+
+       write_lock(&ls->ls_dirtbl[bucket].lock);
+       tmp = search_bucket(ls, name, namelen, bucket);
+       if (tmp) {
+               free_direntry(de);
+               de = tmp;
+       } else {
+               list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
+       }
+       *r_nodeid = de->master_nodeid;
+       write_unlock(&ls->ls_dirtbl[bucket].lock);
+       return 0;
+}
+
+int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
+                  int *r_nodeid)
+{
+       return get_entry(ls, nodeid, name, namelen, r_nodeid);
+}
+
+/* Copy the names of master rsb's into the buffer provided.
+   Only select names whose dir node is the given nodeid. */
+
+void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
+                          char *outbuf, int outlen, int nodeid)
+{
+       struct list_head *list;
+       struct dlm_rsb *start_r = NULL, *r = NULL;
+       int offset = 0, start_namelen, error, dir_nodeid;
+       char *start_name;
+       uint16_t be_namelen;
+
+       /*
+        * Find the rsb where we left off (or start again)
+        */
+
+       start_namelen = inlen;
+       start_name = inbuf;
+
+       if (start_namelen > 1) {
+               /*
+                * We could also use a find_rsb_root() function here that
+                * searched the ls_root_list.
+                */
+               error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
+                                    &start_r);
+               DLM_ASSERT(!error && start_r,
+                          printk("error %d\n", error););
+               DLM_ASSERT(!list_empty(&start_r->res_root_list),
+                          dlm_print_rsb(start_r););
+               dlm_put_rsb(start_r);
+       }
+
+       /*
+        * Send rsb names for rsb's we're master of and whose directory node
+        * matches the requesting node.
+        */
+
+       down_read(&ls->ls_root_sem);
+       if (start_r)
+               list = start_r->res_root_list.next;
+       else
+               list = ls->ls_root_list.next;
+
+       for (offset = 0; list != &ls->ls_root_list; list = list->next) {
+               r = list_entry(list, struct dlm_rsb, res_root_list);
+               if (r->res_nodeid)
+                       continue;
+
+               dir_nodeid = dlm_dir_nodeid(r);
+               if (dir_nodeid != nodeid)
+                       continue;
+
+               /*
+                * The block ends when we can't fit the following in the
+                * remaining buffer space:
+                * namelen (uint16_t) +
+                * name (r->res_length) +
+                * end-of-block record 0x0000 (uint16_t)
+                */
+
+               if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
+                       /* Write end-of-block record */
+                       be_namelen = 0;
+                       memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+                       offset += sizeof(uint16_t);
+                       goto out;
+               }
+
+               be_namelen = cpu_to_be16(r->res_length);
+               memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+               offset += sizeof(uint16_t);
+               memcpy(outbuf + offset, r->res_name, r->res_length);
+               offset += r->res_length;
+       }
+
+       /*
+        * If we've reached the end of the list (and there's room) write a
+        * terminating record.
+        */
+
+       if ((list == &ls->ls_root_list) &&
+           (offset + sizeof(uint16_t) <= outlen)) {
+               be_namelen = 0xFFFF;
+               memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+               offset += sizeof(uint16_t);
+       }
+
+ out:
+       up_read(&ls->ls_root_sem);
+}
+
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644 (file)
index 0000000..0b0eb12
--- /dev/null
@@ -0,0 +1,30 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __DIR_DOT_H__
+#define __DIR_DOT_H__
+
+
+int dlm_dir_nodeid(struct dlm_rsb *rsb);
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
+void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
+void dlm_dir_clear(struct dlm_ls *ls);
+void dlm_clear_free_entries(struct dlm_ls *ls);
+int dlm_recover_directory(struct dlm_ls *ls);
+int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
+       int *r_nodeid);
+void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
+       char *outbuf, int outlen, int nodeid);
+
+#endif                         /* __DIR_DOT_H__ */
+
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644 (file)
index 0000000..0020cd0
--- /dev/null
@@ -0,0 +1,518 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __DLM_INTERNAL_DOT_H__
+#define __DLM_INTERNAL_DOT_H__
+
+/*
+ * This is the main header file to be included in each DLM source file.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/socket.h>
+#include <linux/kthread.h>
+#include <linux/kobject.h>
+#include <linux/kref.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <asm/semaphore.h>
+#include <asm/uaccess.h>
+
+#include <linux/dlm.h>
+
+#define DLM_LOCKSPACE_LEN      64
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+#if (BITS_PER_LONG == 64)
+#define PRIx64 "lx"
+#else
+#define PRIx64 "Lx"
+#endif
+
+/* Size of the temp buffer midcomms allocates on the stack.
+   We try to make this large enough so most messages fit.
+   FIXME: should sctp make this unnecessary? */
+
+#define DLM_INBUF_LEN          148
+
+struct dlm_ls;
+struct dlm_lkb;
+struct dlm_rsb;
+struct dlm_member;
+struct dlm_lkbtable;
+struct dlm_rsbtable;
+struct dlm_dirtable;
+struct dlm_direntry;
+struct dlm_recover;
+struct dlm_header;
+struct dlm_message;
+struct dlm_rcom;
+struct dlm_mhandle;
+
+#define log_print(fmt, args...) \
+       printk(KERN_ERR "dlm: "fmt"\n" , ##args)
+#define log_error(ls, fmt, args...) \
+       printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
+
+#ifdef DLM_LOG_DEBUG
+#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
+#else
+#define log_debug(ls, fmt, args...)
+#endif
+
+#define DLM_ASSERT(x, do) \
+{ \
+  if (!(x)) \
+  { \
+    printk(KERN_ERR "\nDLM:  Assertion failed on line %d of file %s\n" \
+               "DLM:  assertion:  \"%s\"\n" \
+               "DLM:  time = %lu\n", \
+               __LINE__, __FILE__, #x, jiffies); \
+    {do} \
+    printk("\n"); \
+    BUG(); \
+    panic("DLM:  Record message above and reboot.\n"); \
+  } \
+}
+
+
+struct dlm_direntry {
+       struct list_head        list;
+       uint32_t                master_nodeid;
+       uint16_t                length;
+       char                    name[1];
+};
+
+struct dlm_dirtable {
+       struct list_head        list;
+       rwlock_t                lock;
+};
+
+struct dlm_rsbtable {
+       struct list_head        list;
+       struct list_head        toss;
+       rwlock_t                lock;
+};
+
+struct dlm_lkbtable {
+       struct list_head        list;
+       rwlock_t                lock;
+       uint16_t                counter;
+};
+
+/*
+ * Lockspace member (per node in a ls)
+ */
+
+struct dlm_member {
+       struct list_head        list;
+       int                     nodeid;
+       int                     weight;
+};
+
+/*
+ * Save and manage recovery state for a lockspace.
+ */
+
+struct dlm_recover {
+       struct list_head        list;
+       int                     *nodeids;
+       int                     node_count;
+       uint64_t                seq;
+};
+
+/*
+ * Pass input args to second stage locking function.
+ */
+
+struct dlm_args {
+       uint32_t                flags;
+       void                    *astaddr;
+       long                    astparam;
+       void                    *bastaddr;
+       int                     mode;
+       struct dlm_lksb         *lksb;
+       struct dlm_range        *range;
+};
+
+
+/*
+ * Lock block
+ *
+ * A lock can be one of three types:
+ *
+ * local copy      lock is mastered locally
+ *                 (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
+ * process copy    lock is mastered on a remote node
+ *                 (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
+ * master copy     master node's copy of a lock owned by remote node
+ *                 (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
+ *
+ * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
+ * dlm_unlock.  The dlm does not modify these or use any private flags in
+ * this field; it only contains DLM_LKF_ flags from dlm.h.  These flags
+ * are sent as-is to the remote master when the lock is remote.
+ *
+ * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
+ * Some internal flags are shared between the master and process nodes;
+ * these shared flags are kept in the lower two bytes.  One of these
+ * flags set on the master copy will be propagated to the process copy
+ * and v.v.  Other internal flags are private to the master or process
+ * node (e.g. DLM_IFL_MSTCPY).  These are kept in the high two bytes.
+ *
+ * lkb_sbflags: status block flags.  These flags are copied directly into
+ * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
+ * ast.  All defined in dlm.h with DLM_SBF_ prefix.
+ *
+ * lkb_status: the lock status indicates which rsb queue the lock is
+ * on, grant, convert, or wait.  DLM_LKSTS_ WAITING/GRANTED/CONVERT
+ *
+ * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
+ * reply is needed.  Only set when the lkb is on the lockspace waiters
+ * list awaiting a reply from a remote node.
+ *
+ * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
+ * is a master copy, nodeid specifies the remote lock holder, when the
+ * lkb is a process copy, the nodeid specifies the lock master.
+ */
+
+/* lkb_ast_type */
+
+#define AST_COMP               1
+#define AST_BAST               2
+
+/* lkb_range[] */
+
+#define GR_RANGE_START         0
+#define GR_RANGE_END           1
+#define RQ_RANGE_START         2
+#define RQ_RANGE_END           3
+
+/* lkb_status */
+
+#define DLM_LKSTS_WAITING      1
+#define DLM_LKSTS_GRANTED      2
+#define DLM_LKSTS_CONVERT      3
+
+/* lkb_flags */
+
+#define DLM_IFL_MSTCPY         0x00010000
+#define DLM_IFL_RESEND         0x00020000
+#define DLM_IFL_RANGE          0x00000001
+
+struct dlm_lkb {
+       struct dlm_rsb          *lkb_resource;  /* the rsb */
+       struct kref             lkb_ref;
+       int                     lkb_nodeid;     /* copied from rsb */
+       int                     lkb_ownpid;     /* pid of lock owner */
+       uint32_t                lkb_id;         /* our lock ID */
+       uint32_t                lkb_remid;      /* lock ID on remote partner */
+       uint32_t                lkb_exflags;    /* external flags from caller */
+       uint32_t                lkb_sbflags;    /* lksb flags */
+       uint32_t                lkb_flags;      /* internal flags */
+       uint32_t                lkb_lvbseq;     /* lvb sequence number */
+
+       int8_t                  lkb_status;     /* granted, waiting, convert */
+       int8_t                  lkb_rqmode;     /* requested lock mode */
+       int8_t                  lkb_grmode;     /* granted lock mode */
+       int8_t                  lkb_bastmode;   /* requested mode */
+       int8_t                  lkb_highbast;   /* highest mode bast sent for */
+
+       int8_t                  lkb_wait_type;  /* type of reply waiting for */
+       int8_t                  lkb_ast_type;   /* type of ast queued for */
+
+       struct list_head        lkb_idtbl_list; /* lockspace lkbtbl */
+       struct list_head        lkb_statequeue; /* rsb g/c/w list */
+       struct list_head        lkb_rsb_lookup; /* waiting for rsb lookup */
+       struct list_head        lkb_wait_reply; /* waiting for remote reply */
+       struct list_head        lkb_astqueue;   /* need ast to be sent */
+
+       uint64_t                *lkb_range;     /* array of gr/rq ranges */
+       char                    *lkb_lvbptr;
+       struct dlm_lksb         *lkb_lksb;      /* caller's status block */
+       void                    *lkb_astaddr;   /* caller's ast function */
+       void                    *lkb_bastaddr;  /* caller's bast function */
+       long                    lkb_astparam;   /* caller's ast arg */
+};
+
+
+struct dlm_rsb {
+       struct dlm_ls           *res_ls;        /* the lockspace */
+       struct kref             res_ref;
+       struct semaphore        res_sem;
+       unsigned long           res_flags;
+       int                     res_length;     /* length of rsb name */
+       int                     res_nodeid;
+       uint32_t                res_lvbseq;
+       uint32_t                res_hash;
+       uint32_t                res_bucket;     /* rsbtbl */
+       unsigned long           res_toss_time;
+       uint32_t                res_first_lkid;
+       struct list_head        res_lookup;     /* lkbs waiting on first */
+       struct list_head        res_hashchain;  /* rsbtbl */
+       struct list_head        res_grantqueue;
+       struct list_head        res_convertqueue;
+       struct list_head        res_waitqueue;
+
+       struct list_head        res_root_list;      /* used for recovery */
+       struct list_head        res_recover_list;   /* used for recovery */
+       int                     res_recover_locks_count;
+
+       char                    *res_lvbptr;
+       char                    res_name[1];
+};
+
+/* find_rsb() flags */
+
+#define R_MASTER               1       /* only return rsb if it's a master */
+#define R_CREATE               2       /* create/add rsb if not found */
+
+/* rsb_flags */
+
+enum rsb_flags {
+       RSB_MASTER_UNCERTAIN,
+       RSB_VALNOTVALID,
+       RSB_VALNOTVALID_PREV,
+       RSB_NEW_MASTER,
+       RSB_NEW_MASTER2,
+       RSB_RECOVER_CONVERT,
+};
+
+static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+       __set_bit(flag, &r->res_flags);
+}
+
+static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+       __clear_bit(flag, &r->res_flags);
+}
+
+static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+       return test_bit(flag, &r->res_flags);
+}
+
+
+/* dlm_header is first element of all structs sent between nodes */
+
+#define DLM_HEADER_MAJOR       0x00020000
+#define DLM_HEADER_MINOR       0x00000001
+
+#define DLM_MSG                        1
+#define DLM_RCOM               2
+
+struct dlm_header {
+       uint32_t                h_version;
+       uint32_t                h_lockspace;
+       uint32_t                h_nodeid;       /* nodeid of sender */
+       uint16_t                h_length;
+       uint8_t                 h_cmd;          /* DLM_MSG, DLM_RCOM */
+       uint8_t                 h_pad;
+};
+
+
+#define DLM_MSG_REQUEST                1
+#define DLM_MSG_CONVERT                2
+#define DLM_MSG_UNLOCK         3
+#define DLM_MSG_CANCEL         4
+#define DLM_MSG_REQUEST_REPLY  5
+#define DLM_MSG_CONVERT_REPLY  6
+#define DLM_MSG_UNLOCK_REPLY   7
+#define DLM_MSG_CANCEL_REPLY   8
+#define DLM_MSG_GRANT          9
+#define DLM_MSG_BAST           10
+#define DLM_MSG_LOOKUP         11
+#define DLM_MSG_REMOVE         12
+#define DLM_MSG_LOOKUP_REPLY   13
+
+struct dlm_message {
+       struct dlm_header       m_header;
+       uint32_t                m_type;         /* DLM_MSG_ */
+       uint32_t                m_nodeid;
+       uint32_t                m_pid;
+       uint32_t                m_lkid;         /* lkid on sender */
+       uint32_t                m_remid;        /* lkid on receiver */
+       uint32_t                m_parent_lkid;
+       uint32_t                m_parent_remid;
+       uint32_t                m_exflags;
+       uint32_t                m_sbflags;
+       uint32_t                m_flags;
+       uint32_t                m_lvbseq;
+       uint32_t                m_hash;
+       int                     m_status;
+       int                     m_grmode;
+       int                     m_rqmode;
+       int                     m_bastmode;
+       int                     m_asts;
+       int                     m_result;       /* 0 or -EXXX */
+       uint64_t                m_range[2];
+       char                    m_extra[0];     /* name or lvb */
+};
+
+
+#define DLM_RS_NODES           0x00000001
+#define DLM_RS_NODES_ALL       0x00000002
+#define DLM_RS_DIR             0x00000004
+#define DLM_RS_DIR_ALL         0x00000008
+#define DLM_RS_LOCKS           0x00000010
+#define DLM_RS_LOCKS_ALL       0x00000020
+#define DLM_RS_DONE            0x00000040
+#define DLM_RS_DONE_ALL                0x00000080
+
+#define DLM_RCOM_STATUS                1
+#define DLM_RCOM_NAMES         2
+#define DLM_RCOM_LOOKUP                3
+#define DLM_RCOM_LOCK          4
+#define DLM_RCOM_STATUS_REPLY  5
+#define DLM_RCOM_NAMES_REPLY   6
+#define DLM_RCOM_LOOKUP_REPLY  7
+#define DLM_RCOM_LOCK_REPLY    8
+
+struct dlm_rcom {
+       struct dlm_header       rc_header;
+       uint32_t                rc_type;        /* DLM_RCOM_ */
+       int                     rc_result;      /* multi-purpose */
+       uint64_t                rc_id;          /* match reply with request */
+       char                    rc_buf[0];
+};
+
+struct rcom_config {
+       uint32_t                rf_lvblen;
+       uint32_t                rf_lsflags;
+       uint64_t                rf_unused;
+};
+
+struct rcom_lock {
+       uint32_t                rl_ownpid;
+       uint32_t                rl_lkid;
+       uint32_t                rl_remid;
+       uint32_t                rl_parent_lkid;
+       uint32_t                rl_parent_remid;
+       uint32_t                rl_exflags;
+       uint32_t                rl_flags;
+       uint32_t                rl_lvbseq;
+       int                     rl_result;
+       int8_t                  rl_rqmode;
+       int8_t                  rl_grmode;
+       int8_t                  rl_status;
+       int8_t                  rl_asts;
+       uint16_t                rl_wait_type;
+       uint16_t                rl_namelen;
+       uint64_t                rl_range[4];
+       char                    rl_name[DLM_RESNAME_MAXLEN];
+       char                    rl_lvb[0];
+};
+
+struct dlm_ls {
+       struct list_head        ls_list;        /* list of lockspaces */
+       uint32_t                ls_global_id;   /* global unique lockspace ID */
+       uint32_t                ls_exflags;
+       int                     ls_lvblen;
+       int                     ls_count;       /* reference count */
+       unsigned long           ls_flags;       /* LSFL_ */
+       struct kobject          ls_kobj;
+
+       struct dlm_rsbtable     *ls_rsbtbl;
+       uint32_t                ls_rsbtbl_size;
+
+       struct dlm_lkbtable     *ls_lkbtbl;
+       uint32_t                ls_lkbtbl_size;
+
+       struct dlm_dirtable     *ls_dirtbl;
+       uint32_t                ls_dirtbl_size;
+
+       struct semaphore        ls_waiters_sem;
+       struct list_head        ls_waiters;     /* lkbs needing a reply */
+
+       struct list_head        ls_nodes;       /* current nodes in ls */
+       struct list_head        ls_nodes_gone;  /* dead node list, recovery */
+       int                     ls_num_nodes;   /* number of nodes in ls */
+       int                     ls_low_nodeid;
+       int                     ls_total_weight;
+       int                     *ls_node_array;
+
+       struct dlm_rsb          ls_stub_rsb;    /* for returning errors */
+       struct dlm_lkb          ls_stub_lkb;    /* for returning errors */
+       struct dlm_message      ls_stub_ms;     /* for faking a reply */
+
+       struct dentry           *ls_debug_dentry; /* debugfs */
+
+       wait_queue_head_t       ls_uevent_wait; /* user part of join/leave */
+       int                     ls_uevent_result;
+
+       /* recovery related */
+
+       struct timer_list       ls_timer;
+       struct task_struct      *ls_recoverd_task;
+       struct semaphore        ls_recoverd_active;
+       spinlock_t              ls_recover_lock;
+       uint32_t                ls_recover_status; /* DLM_RS_ */
+       uint64_t                ls_recover_seq;
+       struct dlm_recover      *ls_recover_args;
+       struct rw_semaphore     ls_in_recovery; /* block local requests */
+       struct list_head        ls_requestqueue;/* queue remote requests */
+       struct semaphore        ls_requestqueue_lock;
+       char                    *ls_recover_buf;
+       struct list_head        ls_recover_list;
+       spinlock_t              ls_recover_list_lock;
+       int                     ls_recover_list_count;
+       wait_queue_head_t       ls_wait_general;
+
+       struct list_head        ls_root_list;   /* root resources */
+       struct rw_semaphore     ls_root_sem;    /* protect root_list */
+
+       int                     ls_namelen;
+       char                    ls_name[1];
+};
+
+#define LSFL_WORK              0
+#define LSFL_RUNNING           1
+#define LSFL_RECOVERY_STOP     2
+#define LSFL_RCOM_READY                3
+#define LSFL_UEVENT_WAIT       4
+
+static inline int dlm_locking_stopped(struct dlm_ls *ls)
+{
+       return !test_bit(LSFL_RUNNING, &ls->ls_flags);
+}
+
+static inline int dlm_recovery_stopped(struct dlm_ls *ls)
+{
+       return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+}
+
+static inline int dlm_no_directory(struct dlm_ls *ls)
+{
+       return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
+}
+
+#endif                         /* __DLM_INTERNAL_DOT_H__ */
+
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644 (file)
index 0000000..81efb36
--- /dev/null
@@ -0,0 +1,3610 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/* Central locking logic has four stages:
+
+   dlm_lock()
+   dlm_unlock()
+
+   request_lock(ls, lkb)
+   convert_lock(ls, lkb)
+   unlock_lock(ls, lkb)
+   cancel_lock(ls, lkb)
+
+   _request_lock(r, lkb)
+   _convert_lock(r, lkb)
+   _unlock_lock(r, lkb)
+   _cancel_lock(r, lkb)
+
+   do_request(r, lkb)
+   do_convert(r, lkb)
+   do_unlock(r, lkb)
+   do_cancel(r, lkb)
+
+   Stage 1 (lock, unlock) is mainly about checking input args and
+   splitting into one of the four main operations:
+
+       dlm_lock          = request_lock
+       dlm_lock+CONVERT  = convert_lock
+       dlm_unlock        = unlock_lock
+       dlm_unlock+CANCEL = cancel_lock
+
+   Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
+   provided to the next stage.
+
+   Stage 3, _xxxx_lock(), determines if the operation is local or remote.
+   When remote, it calls send_xxxx(), when local it calls do_xxxx().
+
+   Stage 4, do_xxxx(), is the guts of the operation.  It manipulates the
+   given rsb and lkb and queues callbacks.
+
+   For remote operations, send_xxxx() results in the corresponding do_xxxx()
+   function being executed on the remote node.  The connecting send/receive
+   calls on local (L) and remote (R) nodes:
+
+   L: send_xxxx()              ->  R: receive_xxxx()
+                                   R: do_xxxx()
+   L: receive_xxxx_reply()     <-  R: send_xxxx_reply()
+*/
+
+#include "dlm_internal.h"
+#include "memory.h"
+#include "lowcomms.h"
+#include "requestqueue.h"
+#include "util.h"
+#include "dir.h"
+#include "member.h"
+#include "lockspace.h"
+#include "ast.h"
+#include "lock.h"
+#include "rcom.h"
+#include "recover.h"
+#include "lvb_table.h"
+#include "config.h"
+
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_remove(struct dlm_rsb *r);
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                                   struct dlm_message *ms);
+static int receive_extralen(struct dlm_message *ms);
+
+/*
+ * Lock compatibilty matrix - thanks Steve
+ * UN = Unlocked state. Not really a state, used as a flag
+ * PD = Padding. Used to make the matrix a nice power of two in size
+ * Other states are the same as the VMS DLM.
+ * Usage: matrix[grmode+1][rqmode+1]  (although m[rq+1][gr+1] is the same)
+ */
+
+static const int __dlm_compat_matrix[8][8] = {
+      /* UN NL CR CW PR PW EX PD */
+        {1, 1, 1, 1, 1, 1, 1, 0},       /* UN */
+        {1, 1, 1, 1, 1, 1, 1, 0},       /* NL */
+        {1, 1, 1, 1, 1, 1, 0, 0},       /* CR */
+        {1, 1, 1, 1, 0, 0, 0, 0},       /* CW */
+        {1, 1, 1, 0, 1, 0, 0, 0},       /* PR */
+        {1, 1, 1, 0, 0, 0, 0, 0},       /* PW */
+        {1, 1, 0, 0, 0, 0, 0, 0},       /* EX */
+        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
+};
+
+/*
+ * This defines the direction of transfer of LVB data.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ * 1 = LVB is returned to the caller
+ * 0 = LVB is written to the resource
+ * -1 = nothing happens to the LVB
+ */
+
+const int dlm_lvb_operations[8][8] = {
+        /* UN   NL  CR  CW  PR  PW  EX  PD*/
+        {  -1,  1,  1,  1,  1,  1,  1, -1 }, /* UN */
+        {  -1,  1,  1,  1,  1,  1,  1,  0 }, /* NL */
+        {  -1, -1,  1,  1,  1,  1,  1,  0 }, /* CR */
+        {  -1, -1, -1,  1,  1,  1,  1,  0 }, /* CW */
+        {  -1, -1, -1, -1,  1,  1,  1,  0 }, /* PR */
+        {  -1,  0,  0,  0,  0,  0,  1,  0 }, /* PW */
+        {  -1,  0,  0,  0,  0,  0,  0,  0 }, /* EX */
+        {  -1,  0,  0,  0,  0,  0,  0,  0 }  /* PD */
+};
+EXPORT_SYMBOL_GPL(dlm_lvb_operations);
+
+#define modes_compat(gr, rq) \
+       __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
+
+int dlm_modes_compat(int mode1, int mode2)
+{
+       return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
+}
+
+/*
+ * Compatibility matrix for conversions with QUECVT set.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ */
+
+static const int __quecvt_compat_matrix[8][8] = {
+      /* UN NL CR CW PR PW EX PD */
+        {0, 0, 0, 0, 0, 0, 0, 0},       /* UN */
+        {0, 0, 1, 1, 1, 1, 1, 0},       /* NL */
+        {0, 0, 0, 1, 1, 1, 1, 0},       /* CR */
+        {0, 0, 0, 0, 1, 1, 1, 0},       /* CW */
+        {0, 0, 0, 1, 0, 1, 1, 0},       /* PR */
+        {0, 0, 0, 0, 0, 0, 1, 0},       /* PW */
+        {0, 0, 0, 0, 0, 0, 0, 0},       /* EX */
+        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
+};
+
+static void dlm_print_lkb(struct dlm_lkb *lkb)
+{
+       printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
+              "     status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
+              lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
+              lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
+              lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
+}
+
+void dlm_print_rsb(struct dlm_rsb *r)
+{
+       printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
+              r->res_nodeid, r->res_flags, r->res_first_lkid,
+              r->res_recover_locks_count, r->res_name);
+}
+
+/* Threads cannot use the lockspace while it's being recovered */
+
+static inline void lock_recovery(struct dlm_ls *ls)
+{
+       down_read(&ls->ls_in_recovery);
+}
+
+static inline void unlock_recovery(struct dlm_ls *ls)
+{
+       up_read(&ls->ls_in_recovery);
+}
+
+static inline int lock_recovery_try(struct dlm_ls *ls)
+{
+       return down_read_trylock(&ls->ls_in_recovery);
+}
+
+static inline int can_be_queued(struct dlm_lkb *lkb)
+{
+       return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
+}
+
+static inline int force_blocking_asts(struct dlm_lkb *lkb)
+{
+       return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
+}
+
+static inline int is_demoted(struct dlm_lkb *lkb)
+{
+       return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
+}
+
+static inline int is_remote(struct dlm_rsb *r)
+{
+       DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
+       return !!r->res_nodeid;
+}
+
+static inline int is_process_copy(struct dlm_lkb *lkb)
+{
+       return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
+}
+
+static inline int is_master_copy(struct dlm_lkb *lkb)
+{
+       if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+               DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
+       return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? TRUE : FALSE;
+}
+
+static inline int middle_conversion(struct dlm_lkb *lkb)
+{
+       if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
+           (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
+               return TRUE;
+       return FALSE;
+}
+
+static inline int down_conversion(struct dlm_lkb *lkb)
+{
+       return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
+}
+
+static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+       if (is_master_copy(lkb))
+               return;
+
+       DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
+
+       lkb->lkb_lksb->sb_status = rv;
+       lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
+
+       dlm_add_ast(lkb, AST_COMP);
+}
+
+static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
+{
+       if (is_master_copy(lkb))
+               send_bast(r, lkb, rqmode);
+       else {
+               lkb->lkb_bastmode = rqmode;
+               dlm_add_ast(lkb, AST_BAST);
+       }
+}
+
+/*
+ * Basic operations on rsb's and lkb's
+ */
+
+static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
+{
+       struct dlm_rsb *r;
+
+       r = allocate_rsb(ls, len);
+       if (!r)
+               return NULL;
+
+       r->res_ls = ls;
+       r->res_length = len;
+       memcpy(r->res_name, name, len);
+       init_MUTEX(&r->res_sem);
+
+       INIT_LIST_HEAD(&r->res_lookup);
+       INIT_LIST_HEAD(&r->res_grantqueue);
+       INIT_LIST_HEAD(&r->res_convertqueue);
+       INIT_LIST_HEAD(&r->res_waitqueue);
+       INIT_LIST_HEAD(&r->res_root_list);
+       INIT_LIST_HEAD(&r->res_recover_list);
+
+       return r;
+}
+
+static int search_rsb_list(struct list_head *head, char *name, int len,
+                          unsigned int flags, struct dlm_rsb **r_ret)
+{
+       struct dlm_rsb *r;
+       int error = 0;
+
+       list_for_each_entry(r, head, res_hashchain) {
+               if (len == r->res_length && !memcmp(name, r->res_name, len))
+                       goto found;
+       }
+       return -ENOENT;
+
+ found:
+       if (r->res_nodeid && (flags & R_MASTER))
+               error = -ENOTBLK;
+       *r_ret = r;
+       return error;
+}
+
+static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
+                      unsigned int flags, struct dlm_rsb **r_ret)
+{
+       struct dlm_rsb *r;
+       int error;
+
+       error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
+       if (!error) {
+               kref_get(&r->res_ref);
+               goto out;
+       }
+       error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
+       if (error)
+               goto out;
+
+       list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
+
+       if (dlm_no_directory(ls))
+               goto out;
+
+       if (r->res_nodeid == -1) {
+               rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
+               r->res_first_lkid = 0;
+       } else if (r->res_nodeid > 0) {
+               rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
+               r->res_first_lkid = 0;
+       } else {
+               DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
+               DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
+       }
+ out:
+       *r_ret = r;
+       return error;
+}
+
+static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
+                     unsigned int flags, struct dlm_rsb **r_ret)
+{
+       int error;
+       write_lock(&ls->ls_rsbtbl[b].lock);
+       error = _search_rsb(ls, name, len, b, flags, r_ret);
+       write_unlock(&ls->ls_rsbtbl[b].lock);
+       return error;
+}
+
+/*
+ * Find rsb in rsbtbl and potentially create/add one
+ *
+ * Delaying the release of rsb's has a similar benefit to applications keeping
+ * NL locks on an rsb, but without the guarantee that the cached master value
+ * will still be valid when the rsb is reused.  Apps aren't always smart enough
+ * to keep NL locks on an rsb that they may lock again shortly; this can lead
+ * to excessive master lookups and removals if we don't delay the release.
+ *
+ * Searching for an rsb means looking through both the normal list and toss
+ * list.  When found on the toss list the rsb is moved to the normal list with
+ * ref count of 1; when found on normal list the ref count is incremented.
+ */
+
+static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
+                   unsigned int flags, struct dlm_rsb **r_ret)
+{
+       struct dlm_rsb *r, *tmp;
+       uint32_t hash, bucket;
+       int error = 0;
+
+       if (dlm_no_directory(ls))
+               flags |= R_CREATE;
+
+       hash = jhash(name, namelen, 0);
+       bucket = hash & (ls->ls_rsbtbl_size - 1);
+
+       error = search_rsb(ls, name, namelen, bucket, flags, &r);
+       if (!error)
+               goto out;
+
+       if (error == -ENOENT && !(flags & R_CREATE))
+               goto out;
+
+       /* the rsb was found but wasn't a master copy */
+       if (error == -ENOTBLK)
+               goto out;
+
+       error = -ENOMEM;
+       r = create_rsb(ls, name, namelen);
+       if (!r)
+               goto out;
+
+       r->res_hash = hash;
+       r->res_bucket = bucket;
+       r->res_nodeid = -1;
+       kref_init(&r->res_ref);
+
+       /* With no directory, the master can be set immediately */
+       if (dlm_no_directory(ls)) {
+               int nodeid = dlm_dir_nodeid(r);
+               if (nodeid == dlm_our_nodeid())
+                       nodeid = 0;
+               r->res_nodeid = nodeid;
+       }
+
+       write_lock(&ls->ls_rsbtbl[bucket].lock);
+       error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
+       if (!error) {
+               write_unlock(&ls->ls_rsbtbl[bucket].lock);
+               free_rsb(r);
+               r = tmp;
+               goto out;
+       }
+       list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
+       write_unlock(&ls->ls_rsbtbl[bucket].lock);
+       error = 0;
+ out:
+       *r_ret = r;
+       return error;
+}
+
+int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
+                unsigned int flags, struct dlm_rsb **r_ret)
+{
+       return find_rsb(ls, name, namelen, flags, r_ret);
+}
+
+/* This is only called to add a reference when the code already holds
+   a valid reference to the rsb, so there's no need for locking. */
+
+static inline void hold_rsb(struct dlm_rsb *r)
+{
+       kref_get(&r->res_ref);
+}
+
+void dlm_hold_rsb(struct dlm_rsb *r)
+{
+       hold_rsb(r);
+}
+
+static void toss_rsb(struct kref *kref)
+{
+       struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+       struct dlm_ls *ls = r->res_ls;
+
+       DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
+       kref_init(&r->res_ref);
+       list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
+       r->res_toss_time = jiffies;
+       if (r->res_lvbptr) {
+               free_lvb(r->res_lvbptr);
+               r->res_lvbptr = NULL;
+       }
+}
+
+/* When all references to the rsb are gone it's transfered to
+   the tossed list for later disposal. */
+
+static void put_rsb(struct dlm_rsb *r)
+{
+       struct dlm_ls *ls = r->res_ls;
+       uint32_t bucket = r->res_bucket;
+
+       write_lock(&ls->ls_rsbtbl[bucket].lock);
+       kref_put(&r->res_ref, toss_rsb);
+       write_unlock(&ls->ls_rsbtbl[bucket].lock);
+}
+
+void dlm_put_rsb(struct dlm_rsb *r)
+{
+       put_rsb(r);
+}
+
+/* See comment for unhold_lkb */
+
+static void unhold_rsb(struct dlm_rsb *r)
+{
+       int rv;
+       rv = kref_put(&r->res_ref, toss_rsb);
+       DLM_ASSERT(!rv, dlm_print_rsb(r););
+}
+
+static void kill_rsb(struct kref *kref)
+{
+       struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+
+       /* All work is done after the return from kref_put() so we
+          can release the write_lock before the remove and free. */
+
+       DLM_ASSERT(list_empty(&r->res_lookup),);
+       DLM_ASSERT(list_empty(&r->res_grantqueue),);
+       DLM_ASSERT(list_empty(&r->res_convertqueue),);
+       DLM_ASSERT(list_empty(&r->res_waitqueue),);
+       DLM_ASSERT(list_empty(&r->res_root_list),);
+       DLM_ASSERT(list_empty(&r->res_recover_list),);
+}
+
+/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
+   The rsb must exist as long as any lkb's for it do. */
+
+static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       hold_rsb(r);
+       lkb->lkb_resource = r;
+}
+
+static void detach_lkb(struct dlm_lkb *lkb)
+{
+       if (lkb->lkb_resource) {
+               put_rsb(lkb->lkb_resource);
+               lkb->lkb_resource = NULL;
+       }
+}
+
+static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+       struct dlm_lkb *lkb, *tmp;
+       uint32_t lkid = 0;
+       uint16_t bucket;
+
+       lkb = allocate_lkb(ls);
+       if (!lkb)
+               return -ENOMEM;
+
+       lkb->lkb_nodeid = -1;
+       lkb->lkb_grmode = DLM_LOCK_IV;
+       kref_init(&lkb->lkb_ref);
+
+       get_random_bytes(&bucket, sizeof(bucket));
+       bucket &= (ls->ls_lkbtbl_size - 1);
+
+       write_lock(&ls->ls_lkbtbl[bucket].lock);
+
+       /* counter can roll over so we must verify lkid is not in use */
+
+       while (lkid == 0) {
+               lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
+
+               list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
+                                   lkb_idtbl_list) {
+                       if (tmp->lkb_id != lkid)
+                               continue;
+                       lkid = 0;
+                       break;
+               }
+       }
+
+       lkb->lkb_id = lkid;
+       list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
+       write_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+       *lkb_ret = lkb;
+       return 0;
+}
+
+static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
+{
+       uint16_t bucket = lkid & 0xFFFF;
+       struct dlm_lkb *lkb;
+
+       list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
+               if (lkb->lkb_id == lkid)
+                       return lkb;
+       }
+       return NULL;
+}
+
+static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
+{
+       struct dlm_lkb *lkb;
+       uint16_t bucket = lkid & 0xFFFF;
+
+       if (bucket >= ls->ls_lkbtbl_size)
+               return -EBADSLT;
+
+       read_lock(&ls->ls_lkbtbl[bucket].lock);
+       lkb = __find_lkb(ls, lkid);
+       if (lkb)
+               kref_get(&lkb->lkb_ref);
+       read_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+       *lkb_ret = lkb;
+       return lkb ? 0 : -ENOENT;
+}
+
+static void kill_lkb(struct kref *kref)
+{
+       struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
+
+       /* All work is done after the return from kref_put() so we
+          can release the write_lock before the detach_lkb */
+
+       DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+}
+
+static int put_lkb(struct dlm_lkb *lkb)
+{
+       struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+       uint16_t bucket = lkb->lkb_id & 0xFFFF;
+
+       write_lock(&ls->ls_lkbtbl[bucket].lock);
+       if (kref_put(&lkb->lkb_ref, kill_lkb)) {
+               list_del(&lkb->lkb_idtbl_list);
+               write_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+               detach_lkb(lkb);
+
+               /* for local/process lkbs, lvbptr points to caller's lksb */
+               if (lkb->lkb_lvbptr && is_master_copy(lkb))
+                       free_lvb(lkb->lkb_lvbptr);
+               if (lkb->lkb_range)
+                       free_range(lkb->lkb_range);
+               free_lkb(lkb);
+               return 1;
+       } else {
+               write_unlock(&ls->ls_lkbtbl[bucket].lock);
+               return 0;
+       }
+}
+
+int dlm_put_lkb(struct dlm_lkb *lkb)
+{
+       return put_lkb(lkb);
+}
+
+/* This is only called to add a reference when the code already holds
+   a valid reference to the lkb, so there's no need for locking. */
+
+static inline void hold_lkb(struct dlm_lkb *lkb)
+{
+       kref_get(&lkb->lkb_ref);
+}
+
+/* This is called when we need to remove a reference and are certain
+   it's not the last ref.  e.g. del_lkb is always called between a
+   find_lkb/put_lkb and is always the inverse of a previous add_lkb.
+   put_lkb would work fine, but would involve unnecessary locking */
+
+static inline void unhold_lkb(struct dlm_lkb *lkb)
+{
+       int rv;
+       rv = kref_put(&lkb->lkb_ref, kill_lkb);
+       DLM_ASSERT(!rv, dlm_print_lkb(lkb););
+}
+
+static void lkb_add_ordered(struct list_head *new, struct list_head *head,
+                           int mode)
+{
+       struct dlm_lkb *lkb = NULL;
+
+       list_for_each_entry(lkb, head, lkb_statequeue)
+               if (lkb->lkb_rqmode < mode)
+                       break;
+
+       if (!lkb)
+               list_add_tail(new, head);
+       else
+               __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
+}
+
+/* add/remove lkb to rsb's grant/convert/wait queue */
+
+static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
+{
+       kref_get(&lkb->lkb_ref);
+
+       DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+
+       lkb->lkb_status = status;
+
+       switch (status) {
+       case DLM_LKSTS_WAITING:
+               if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
+                       list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
+               else
+                       list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
+               break;
+       case DLM_LKSTS_GRANTED:
+               /* convention says granted locks kept in order of grmode */
+               lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
+                               lkb->lkb_grmode);
+               break;
+       case DLM_LKSTS_CONVERT:
+               if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
+                       list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
+               else
+                       list_add_tail(&lkb->lkb_statequeue,
+                                     &r->res_convertqueue);
+               break;
+       default:
+               DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
+       }
+}
+
+static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       lkb->lkb_status = 0;
+       list_del(&lkb->lkb_statequeue);
+       unhold_lkb(lkb);
+}
+
+static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
+{
+       hold_lkb(lkb);
+       del_lkb(r, lkb);
+       add_lkb(r, lkb, sts);
+       unhold_lkb(lkb);
+}
+
+/* add/remove lkb from global waiters list of lkb's waiting for
+   a reply from a remote node */
+
+static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
+{
+       struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+
+       down(&ls->ls_waiters_sem);
+       if (lkb->lkb_wait_type) {
+               log_print("add_to_waiters error %d", lkb->lkb_wait_type);
+               goto out;
+       }
+       lkb->lkb_wait_type = mstype;
+       kref_get(&lkb->lkb_ref);
+       list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
+ out:
+       up(&ls->ls_waiters_sem);
+}
+
+static int _remove_from_waiters(struct dlm_lkb *lkb)
+{
+       int error = 0;
+
+       if (!lkb->lkb_wait_type) {
+               log_print("remove_from_waiters error");
+               error = -EINVAL;
+               goto out;
+       }
+       lkb->lkb_wait_type = 0;
+       list_del(&lkb->lkb_wait_reply);
+       unhold_lkb(lkb);
+ out:
+       return error;
+}
+
+static int remove_from_waiters(struct dlm_lkb *lkb)
+{
+       struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+       int error;
+
+       down(&ls->ls_waiters_sem);
+       error = _remove_from_waiters(lkb);
+       up(&ls->ls_waiters_sem);
+       return error;
+}
+
+static void dir_remove(struct dlm_rsb *r)
+{
+       int to_nodeid;
+
+       if (dlm_no_directory(r->res_ls))
+               return;
+
+       to_nodeid = dlm_dir_nodeid(r);
+       if (to_nodeid != dlm_our_nodeid())
+               send_remove(r);
+       else
+               dlm_dir_remove_entry(r->res_ls, to_nodeid,
+                                    r->res_name, r->res_length);
+}
+
+/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
+   found since they are in order of newest to oldest? */
+
+static int shrink_bucket(struct dlm_ls *ls, int b)
+{
+       struct dlm_rsb *r;
+       int count = 0, found;
+
+       for (;;) {
+               found = FALSE;
+               write_lock(&ls->ls_rsbtbl[b].lock);
+               list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
+                                           res_hashchain) {
+                       if (!time_after_eq(jiffies, r->res_toss_time +
+                                          dlm_config.toss_secs * HZ))
+                               continue;
+                       found = TRUE;
+                       break;
+               }
+
+               if (!found) {
+                       write_unlock(&ls->ls_rsbtbl[b].lock);
+                       break;
+               }
+
+               if (kref_put(&r->res_ref, kill_rsb)) {
+                       list_del(&r->res_hashchain);
+                       write_unlock(&ls->ls_rsbtbl[b].lock);
+
+                       if (is_master(r))
+                               dir_remove(r);
+                       free_rsb(r);
+                       count++;
+               } else {
+                       write_unlock(&ls->ls_rsbtbl[b].lock);
+                       log_error(ls, "tossed rsb in use %s", r->res_name);
+               }
+       }
+
+       return count;
+}
+
+void dlm_scan_rsbs(struct dlm_ls *ls)
+{
+       int i;
+
+       if (dlm_locking_stopped(ls))
+               return;
+
+       for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+               shrink_bucket(ls, i);
+               cond_resched();
+       }
+}
+
+/* lkb is master or local copy */
+
+static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       int b, len = r->res_ls->ls_lvblen;
+
+       /* b=1 lvb returned to caller
+          b=0 lvb written to rsb or invalidated
+          b=-1 do nothing */
+
+       b =  dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+
+       if (b == 1) {
+               if (!lkb->lkb_lvbptr)
+                       return;
+
+               if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+                       return;
+
+               if (!r->res_lvbptr)
+                       return;
+
+               memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
+               lkb->lkb_lvbseq = r->res_lvbseq;
+
+       } else if (b == 0) {
+               if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+                       rsb_set_flag(r, RSB_VALNOTVALID);
+                       return;
+               }
+
+               if (!lkb->lkb_lvbptr)
+                       return;
+
+               if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+                       return;
+
+               if (!r->res_lvbptr)
+                       r->res_lvbptr = allocate_lvb(r->res_ls);
+
+               if (!r->res_lvbptr)
+                       return;
+
+               memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
+               r->res_lvbseq++;
+               lkb->lkb_lvbseq = r->res_lvbseq;
+               rsb_clear_flag(r, RSB_VALNOTVALID);
+       }
+
+       if (rsb_flag(r, RSB_VALNOTVALID))
+               lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
+}
+
+static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       if (lkb->lkb_grmode < DLM_LOCK_PW)
+               return;
+
+       if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+               rsb_set_flag(r, RSB_VALNOTVALID);
+               return;
+       }
+
+       if (!lkb->lkb_lvbptr)
+               return;
+
+       if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+               return;
+
+       if (!r->res_lvbptr)
+               r->res_lvbptr = allocate_lvb(r->res_ls);
+
+       if (!r->res_lvbptr)
+               return;
+
+       memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+       r->res_lvbseq++;
+       rsb_clear_flag(r, RSB_VALNOTVALID);
+}
+
+/* lkb is process copy (pc) */
+
+static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                           struct dlm_message *ms)
+{
+       int b;
+
+       if (!lkb->lkb_lvbptr)
+               return;
+
+       if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+               return;
+
+       b =  dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+       if (b == 1) {
+               int len = receive_extralen(ms);
+               memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
+               lkb->lkb_lvbseq = ms->m_lvbseq;
+       }
+}
+
+/* Manipulate lkb's on rsb's convert/granted/waiting queues
+   remove_lock -- used for unlock, removes lkb from granted
+   revert_lock -- used for cancel, moves lkb from convert to granted
+   grant_lock  -- used for request and convert, adds lkb to granted or
+                  moves lkb from convert or waiting to granted
+
+   Each of these is used for master or local copy lkb's.  There is
+   also a _pc() variation used to make the corresponding change on
+   a process copy (pc) lkb. */
+
+static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       del_lkb(r, lkb);
+       lkb->lkb_grmode = DLM_LOCK_IV;
+       /* this unhold undoes the original ref from create_lkb()
+          so this leads to the lkb being freed */
+       unhold_lkb(lkb);
+}
+
+static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       set_lvb_unlock(r, lkb);
+       _remove_lock(r, lkb);
+}
+
+static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       _remove_lock(r, lkb);
+}
+
+static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       lkb->lkb_rqmode = DLM_LOCK_IV;
+
+       switch (lkb->lkb_status) {
+       case DLM_LKSTS_CONVERT:
+               move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+               break;
+       case DLM_LKSTS_WAITING:
+               del_lkb(r, lkb);
+               lkb->lkb_grmode = DLM_LOCK_IV;
+               /* this unhold undoes the original ref from create_lkb()
+                  so this leads to the lkb being freed */
+               unhold_lkb(lkb);
+               break;
+       default:
+               log_print("invalid status for revert %d", lkb->lkb_status);
+       }
+}
+
+static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       revert_lock(r, lkb);
+}
+
+static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       if (lkb->lkb_grmode != lkb->lkb_rqmode) {
+               lkb->lkb_grmode = lkb->lkb_rqmode;
+               if (lkb->lkb_status)
+                       move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+               else
+                       add_lkb(r, lkb, DLM_LKSTS_GRANTED);
+       }
+
+       lkb->lkb_rqmode = DLM_LOCK_IV;
+
+       if (lkb->lkb_range) {
+               lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
+               lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
+       }
+}
+
+static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       set_lvb_lock(r, lkb);
+       _grant_lock(r, lkb);
+       lkb->lkb_highbast = 0;
+}
+
+static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                         struct dlm_message *ms)
+{
+       set_lvb_lock_pc(r, lkb, ms);
+       _grant_lock(r, lkb);
+}
+
+/* called by grant_pending_locks() which means an async grant message must
+   be sent to the requesting node in addition to granting the lock if the
+   lkb belongs to a remote node. */
+
+static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       grant_lock(r, lkb);
+       if (is_master_copy(lkb))
+               send_grant(r, lkb);
+       else
+               queue_cast(r, lkb, 0);
+}
+
+static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
+{
+       struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
+                                          lkb_statequeue);
+       if (lkb->lkb_id == first->lkb_id)
+               return TRUE;
+
+       return FALSE;
+}
+
+/* Return 1 if the locks' ranges overlap.  If the lkb has no range then it is
+   assumed to cover 0-ffffffff.ffffffff */
+
+static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
+{
+       if (!lkb1->lkb_range || !lkb2->lkb_range)
+               return TRUE;
+
+       if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
+           lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
+               return FALSE;
+
+       return TRUE;
+}
+
+/* Check if the given lkb conflicts with another lkb on the queue. */
+
+static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
+{
+       struct dlm_lkb *this;
+
+       list_for_each_entry(this, head, lkb_statequeue) {
+               if (this == lkb)
+                       continue;
+               if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
+                       return TRUE;
+       }
+       return FALSE;
+}
+
+/*
+ * "A conversion deadlock arises with a pair of lock requests in the converting
+ * queue for one resource.  The granted mode of each lock blocks the requested
+ * mode of the other lock."
+ *
+ * Part 2: if the granted mode of lkb is preventing the first lkb in the
+ * convert queue from being granted, then demote lkb (set grmode to NL).
+ * This second form requires that we check for conv-deadlk even when
+ * now == 0 in _can_be_granted().
+ *
+ * Example:
+ * Granted Queue: empty
+ * Convert Queue: NL->EX (first lock)
+ *                PR->EX (second lock)
+ *
+ * The first lock can't be granted because of the granted mode of the second
+ * lock and the second lock can't be granted because it's not first in the
+ * list.  We demote the granted mode of the second lock (the lkb passed to this
+ * function).
+ *
+ * After the resolution, the "grant pending" function needs to go back and try
+ * to grant locks on the convert queue again since the first lock can now be
+ * granted.
+ */
+
+static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
+{
+       struct dlm_lkb *this, *first = NULL, *self = NULL;
+
+       list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
+               if (!first)
+                       first = this;
+               if (this == lkb) {
+                       self = lkb;
+                       continue;
+               }
+
+               if (!ranges_overlap(lkb, this))
+                       continue;
+
+               if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
+                       return TRUE;
+       }
+
+       /* if lkb is on the convert queue and is preventing the first
+          from being granted, then there's deadlock and we demote lkb.
+          multiple converting locks may need to do this before the first
+          converting lock can be granted. */
+
+       if (self && self != first) {
+               if (!modes_compat(lkb, first) &&
+                   !queue_conflict(&rsb->res_grantqueue, first))
+                       return TRUE;
+       }
+
+       return FALSE;
+}
+
+/*
+ * Return 1 if the lock can be granted, 0 otherwise.
+ * Also detect and resolve conversion deadlocks.
+ *
+ * lkb is the lock to be granted
+ *
+ * now is 1 if the function is being called in the context of the
+ * immediate request, it is 0 if called later, after the lock has been
+ * queued.
+ *
+ * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
+ */
+
+static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
+{
+       int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
+
+       /*
+        * 6-10: Version 5.4 introduced an option to address the phenomenon of
+        * a new request for a NL mode lock being blocked.
+        *
+        * 6-11: If the optional EXPEDITE flag is used with the new NL mode
+        * request, then it would be granted.  In essence, the use of this flag
+        * tells the Lock Manager to expedite theis request by not considering
+        * what may be in the CONVERTING or WAITING queues...  As of this
+        * writing, the EXPEDITE flag can be used only with new requests for NL
+        * mode locks.  This flag is not valid for conversion requests.
+        *
+        * A shortcut.  Earlier checks return an error if EXPEDITE is used in a
+        * conversion or used with a non-NL requested mode.  We also know an
+        * EXPEDITE request is always granted immediately, so now must always
+        * be 1.  The full condition to grant an expedite request: (now &&
+        * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
+        * therefore be shortened to just checking the flag.
+        */
+
+       if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
+               return TRUE;
+
+       /*
+        * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
+        * added to the remaining conditions.
+        */
+
+       if (queue_conflict(&r->res_grantqueue, lkb))
+               goto out;
+
+       /*
+        * 6-3: By default, a conversion request is immediately granted if the
+        * requested mode is compatible with the modes of all other granted
+        * locks
+        */
+
+       if (queue_conflict(&r->res_convertqueue, lkb))
+               goto out;
+
+       /*
+        * 6-5: But the default algorithm for deciding whether to grant or
+        * queue conversion requests does not by itself guarantee that such
+        * requests are serviced on a "first come first serve" basis.  This, in
+        * turn, can lead to a phenomenon known as "indefinate postponement".
+        *
+        * 6-7: This issue is dealt with by using the optional QUECVT flag with
+        * the system service employed to request a lock conversion.  This flag
+        * forces certain conversion requests to be queued, even if they are
+        * compatible with the granted modes of other locks on the same
+        * resource.  Thus, the use of this flag results in conversion requests
+        * being ordered on a "first come first servce" basis.
+        *
+        * DCT: This condition is all about new conversions being able to occur
+        * "in place" while the lock remains on the granted queue (assuming
+        * nothing else conflicts.)  IOW if QUECVT isn't set, a conversion
+        * doesn't _have_ to go onto the convert queue where it's processed in
+        * order.  The "now" variable is necessary to distinguish converts
+        * being received and processed for the first time now, because once a
+        * convert is moved to the conversion queue the condition below applies
+        * requiring fifo granting.
+        */
+
+       if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
+               return TRUE;
+
+       /*
+        * When using range locks the NOORDER flag is set to avoid the standard
+        * vms rules on grant order.
+        */
+
+       if (lkb->lkb_exflags & DLM_LKF_NOORDER)
+               return TRUE;
+
+       /*
+        * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
+        * granted until all other conversion requests ahead of it are granted
+        * and/or canceled.
+        */
+
+       if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
+               return TRUE;
+
+       /*
+        * 6-4: By default, a new request is immediately granted only if all
+        * three of the following conditions are satisfied when the request is
+        * issued:
+        * - The queue of ungranted conversion requests for the resource is
+        *   empty.
+        * - The queue of ungranted new requests for the resource is empty.
+        * - The mode of the new request is compatible with the most
+        *   restrictive mode of all granted locks on the resource.
+        */
+
+       if (now && !conv && list_empty(&r->res_convertqueue) &&
+           list_empty(&r->res_waitqueue))
+               return TRUE;
+
+       /*
+        * 6-4: Once a lock request is in the queue of ungranted new requests,
+        * it cannot be granted until the queue of ungranted conversion
+        * requests is empty, all ungranted new requests ahead of it are
+        * granted and/or canceled, and it is compatible with the granted mode
+        * of the most restrictive lock granted on the resource.
+        */
+
+       if (!now && !conv && list_empty(&r->res_convertqueue) &&
+           first_in_list(lkb, &r->res_waitqueue))
+               return TRUE;
+
+ out:
+       /*
+        * The following, enabled by CONVDEADLK, departs from VMS.
+        */
+
+       if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
+           conversion_deadlock_detect(r, lkb)) {
+               lkb->lkb_grmode = DLM_LOCK_NL;
+               lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
+       }
+
+       return FALSE;
+}
+
+/*
+ * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
+ * simple way to provide a big optimization to applications that can use them.
+ */
+
+static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
+{
+       uint32_t flags = lkb->lkb_exflags;
+       int rv;
+       int8_t alt = 0, rqmode = lkb->lkb_rqmode;
+
+       rv = _can_be_granted(r, lkb, now);
+       if (rv)
+               goto out;
+
+       if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
+               goto out;
+
+       if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
+               alt = DLM_LOCK_PR;
+       else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
+               alt = DLM_LOCK_CW;
+
+       if (alt) {
+               lkb->lkb_rqmode = alt;
+               rv = _can_be_granted(r, lkb, now);
+               if (rv)
+                       lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
+               else
+                       lkb->lkb_rqmode = rqmode;
+       }
+ out:
+       return rv;
+}
+
+static int grant_pending_convert(struct dlm_rsb *r, int high)
+{
+       struct dlm_lkb *lkb, *s;
+       int hi, demoted, quit, grant_restart, demote_restart;
+
+       quit = 0;
+ restart:
+       grant_restart = 0;
+       demote_restart = 0;
+       hi = DLM_LOCK_IV;
+
+       list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
+               demoted = is_demoted(lkb);
+               if (can_be_granted(r, lkb, FALSE)) {
+                       grant_lock_pending(r, lkb);
+                       grant_restart = 1;
+               } else {
+                       hi = max_t(int, lkb->lkb_rqmode, hi);
+                       if (!demoted && is_demoted(lkb))
+                               demote_restart = 1;
+               }
+       }
+
+       if (grant_restart)
+               goto restart;
+       if (demote_restart && !quit) {
+               quit = 1;
+               goto restart;
+       }
+
+       return max_t(int, high, hi);
+}
+
+static int grant_pending_wait(struct dlm_rsb *r, int high)
+{
+       struct dlm_lkb *lkb, *s;
+
+       list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
+               if (can_be_granted(r, lkb, FALSE))
+                       grant_lock_pending(r, lkb);
+                else
+                       high = max_t(int, lkb->lkb_rqmode, high);
+       }
+
+       return high;
+}
+
+static void grant_pending_locks(struct dlm_rsb *r)
+{
+       struct dlm_lkb *lkb, *s;
+       int high = DLM_LOCK_IV;
+
+       DLM_ASSERT(is_master(r), dlm_print_rsb(r););
+
+       high = grant_pending_convert(r, high);
+       high = grant_pending_wait(r, high);
+
+       if (high == DLM_LOCK_IV)
+               return;
+
+       /*
+        * If there are locks left on the wait/convert queue then send blocking
+        * ASTs to granted locks based on the largest requested mode (high)
+        * found above.  This can generate spurious blocking ASTs for range
+        * locks. FIXME: highbast < high comparison not valid for PR/CW.
+        */
+
+       list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
+               if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
+                   !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
+                       queue_bast(r, lkb, high);
+                       lkb->lkb_highbast = high;
+               }
+       }
+}
+
+static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
+                           struct dlm_lkb *lkb)
+{
+       struct dlm_lkb *gr;
+
+       list_for_each_entry(gr, head, lkb_statequeue) {
+               if (gr->lkb_bastaddr &&
+                   gr->lkb_highbast < lkb->lkb_rqmode &&
+                   ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
+                       queue_bast(r, gr, lkb->lkb_rqmode);
+                       gr->lkb_highbast = lkb->lkb_rqmode;
+               }
+       }
+}
+
+static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       send_bast_queue(r, &r->res_grantqueue, lkb);
+}
+
+static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       send_bast_queue(r, &r->res_grantqueue, lkb);
+       send_bast_queue(r, &r->res_convertqueue, lkb);
+}
+
+/* set_master(r, lkb) -- set the master nodeid of a resource
+
+   The purpose of this function is to set the nodeid field in the given
+   lkb using the nodeid field in the given rsb.  If the rsb's nodeid is
+   known, it can just be copied to the lkb and the function will return
+   0.  If the rsb's nodeid is _not_ known, it needs to be looked up
+   before it can be copied to the lkb.
+
+   When the rsb nodeid is being looked up remotely, the initial lkb
+   causing the lookup is kept on the ls_waiters list waiting for the
+   lookup reply.  Other lkb's waiting for the same rsb lookup are kept
+   on the rsb's res_lookup list until the master is verified.
+
+   Return values:
+   0: nodeid is set in rsb/lkb and the caller should go ahead and use it
+   1: the rsb master is not available and the lkb has been placed on
+      a wait queue
+*/
+
+static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       struct dlm_ls *ls = r->res_ls;
+       int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
+
+       if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
+               rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
+               r->res_first_lkid = lkb->lkb_id;
+               lkb->lkb_nodeid = r->res_nodeid;
+               return 0;
+       }
+
+       if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
+               list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
+               return 1;
+       }
+
+       if (r->res_nodeid == 0) {
+               lkb->lkb_nodeid = 0;
+               return 0;
+       }
+
+       if (r->res_nodeid > 0) {
+               lkb->lkb_nodeid = r->res_nodeid;
+               return 0;
+       }
+
+       DLM_ASSERT(r->res_nodeid == -1, dlm_print_rsb(r););
+
+       dir_nodeid = dlm_dir_nodeid(r);
+
+       if (dir_nodeid != our_nodeid) {
+               r->res_first_lkid = lkb->lkb_id;
+               send_lookup(r, lkb);
+               return 1;
+       }
+
+       for (;;) {
+               /* It's possible for dlm_scand to remove an old rsb for
+                  this same resource from the toss list, us to create
+                  a new one, look up the master locally, and find it
+                  already exists just before dlm_scand does the
+                  dir_remove() on the previous rsb. */
+
+               error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
+                                      r->res_length, &ret_nodeid);
+               if (!error)
+                       break;
+               log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
+               schedule();
+       }
+
+       if (ret_nodeid == our_nodeid) {
+               r->res_first_lkid = 0;
+               r->res_nodeid = 0;
+               lkb->lkb_nodeid = 0;
+       } else {
+               r->res_first_lkid = lkb->lkb_id;
+               r->res_nodeid = ret_nodeid;
+               lkb->lkb_nodeid = ret_nodeid;
+       }
+       return 0;
+}
+
+static void process_lookup_list(struct dlm_rsb *r)
+{
+       struct dlm_lkb *lkb, *safe;
+
+       list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
+               list_del(&lkb->lkb_rsb_lookup);
+               _request_lock(r, lkb);
+               schedule();
+       }
+}
+
+/* confirm_master -- confirm (or deny) an rsb's master nodeid */
+
+static void confirm_master(struct dlm_rsb *r, int error)
+{
+       struct dlm_lkb *lkb;
+
+       if (!r->res_first_lkid)
+               return;
+
+       switch (error) {
+       case 0:
+       case -EINPROGRESS:
+               r->res_first_lkid = 0;
+               process_lookup_list(r);
+               break;
+
+       case -EAGAIN:
+               /* the remote master didn't queue our NOQUEUE request;
+                  make a waiting lkb the first_lkid */
+
+               r->res_first_lkid = 0;
+
+               if (!list_empty(&r->res_lookup)) {
+                       lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
+                                        lkb_rsb_lookup);
+                       list_del(&lkb->lkb_rsb_lookup);
+                       r->res_first_lkid = lkb->lkb_id;
+                       _request_lock(r, lkb);
+               } else
+                       r->res_nodeid = -1;
+               break;
+
+       default:
+               log_error(r->res_ls, "confirm_master unknown error %d", error);
+       }
+}
+
+static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
+                        int namelen, uint32_t parent_lkid, void *ast,
+                        void *astarg, void *bast, struct dlm_range *range,
+                        struct dlm_args *args)
+{
+       int rv = -EINVAL;
+
+       /* check for invalid arg usage */
+
+       if (mode < 0 || mode > DLM_LOCK_EX)
+               goto out;
+
+       if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
+               goto out;
+
+       if (flags & DLM_LKF_CANCEL)
+               goto out;
+
+       if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
+               goto out;
+
+       if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
+               goto out;
+
+       if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
+               goto out;
+
+       if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
+               goto out;
+
+       if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
+               goto out;
+
+       if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
+               goto out;
+
+       if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
+               goto out;
+
+       if (!ast || !lksb)
+               goto out;
+
+       if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
+               goto out;
+
+       /* parent/child locks not yet supported */
+       if (parent_lkid)
+               goto out;
+
+       if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
+               goto out;
+
+       /* these args will be copied to the lkb in validate_lock_args,
+          it cannot be done now because when converting locks, fields in
+          an active lkb cannot be modified before locking the rsb */
+
+       args->flags = flags;
+       args->astaddr = ast;
+       args->astparam = (long) astarg;
+       args->bastaddr = bast;
+       args->mode = mode;
+       args->lksb = lksb;
+       args->range = range;
+       rv = 0;
+ out:
+       return rv;
+}
+
+static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
+{
+       if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
+                     DLM_LKF_FORCEUNLOCK))
+               return -EINVAL;
+
+       args->flags = flags;
+       args->astparam = (long) astarg;
+       return 0;
+}
+
+static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                             struct dlm_args *args)
+{
+       int rv = -EINVAL;
+
+       if (args->flags & DLM_LKF_CONVERT) {
+               if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+                       goto out;
+
+               if (args->flags & DLM_LKF_QUECVT &&
+                   !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
+                       goto out;
+
+               rv = -EBUSY;
+               if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+                       goto out;
+
+               if (lkb->lkb_wait_type)
+                       goto out;
+       }
+
+       lkb->lkb_exflags = args->flags;
+       lkb->lkb_sbflags = 0;
+       lkb->lkb_astaddr = args->astaddr;
+       lkb->lkb_astparam = args->astparam;
+       lkb->lkb_bastaddr = args->bastaddr;
+       lkb->lkb_rqmode = args->mode;
+       lkb->lkb_lksb = args->lksb;
+       lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
+       lkb->lkb_ownpid = (int) current->pid;
+
+       rv = 0;
+       if (!args->range)
+               goto out;
+
+       if (!lkb->lkb_range) {
+               rv = -ENOMEM;
+               lkb->lkb_range = allocate_range(ls);
+               if (!lkb->lkb_range)
+                       goto out;
+               /* This is needed for conversions that contain ranges
+                  where the original lock didn't but it's harmless for
+                  new locks too. */
+               lkb->lkb_range[GR_RANGE_START] = 0LL;
+               lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
+       }
+
+       lkb->lkb_range[RQ_RANGE_START] = args->range->ra_start;
+       lkb->lkb_range[RQ_RANGE_END] = args->range->ra_end;
+       lkb->lkb_flags |= DLM_IFL_RANGE;
+       rv = 0;
+ out:
+       return rv;
+}
+
+static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
+{
+       int rv = -EINVAL;
+
+       if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+               goto out;
+
+       if (args->flags & DLM_LKF_FORCEUNLOCK)
+               goto out_ok;
+
+       if (args->flags & DLM_LKF_CANCEL &&
+           lkb->lkb_status == DLM_LKSTS_GRANTED)
+               goto out;
+
+       if (!(args->flags & DLM_LKF_CANCEL) &&
+           lkb->lkb_status != DLM_LKSTS_GRANTED)
+               goto out;
+
+       rv = -EBUSY;
+       if (lkb->lkb_wait_type)
+               goto out;
+
+ out_ok:
+       lkb->lkb_exflags = args->flags;
+       lkb->lkb_sbflags = 0;
+       lkb->lkb_astparam = args->astparam;
+
+       rv = 0;
+ out:
+       return rv;
+}
+
+/*
+ * Four stage 4 varieties:
+ * do_request(), do_convert(), do_unlock(), do_cancel()
+ * These are called on the master node for the given lock and
+ * from the central locking logic.
+ */
+
+static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       int error = 0;
+
+       if (can_be_granted(r, lkb, TRUE)) {
+               grant_lock(r, lkb);
+               queue_cast(r, lkb, 0);
+               goto out;
+       }
+
+       if (can_be_queued(lkb)) {
+               error = -EINPROGRESS;
+               add_lkb(r, lkb, DLM_LKSTS_WAITING);
+               send_blocking_asts(r, lkb);
+               goto out;
+       }
+
+       error = -EAGAIN;
+       if (force_blocking_asts(lkb))
+               send_blocking_asts_all(r, lkb);
+       queue_cast(r, lkb, -EAGAIN);
+
+ out:
+       return error;
+}
+
+static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       int error = 0;
+
+       /* changing an existing lock may allow others to be granted */
+
+       if (can_be_granted(r, lkb, TRUE)) {
+               grant_lock(r, lkb);
+               queue_cast(r, lkb, 0);
+               grant_pending_locks(r);
+               goto out;
+       }
+
+       if (can_be_queued(lkb)) {
+               if (is_demoted(lkb))
+                       grant_pending_locks(r);
+               error = -EINPROGRESS;
+               del_lkb(r, lkb);
+               add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+               send_blocking_asts(r, lkb);
+               goto out;
+       }
+
+       error = -EAGAIN;
+       if (force_blocking_asts(lkb))
+               send_blocking_asts_all(r, lkb);
+       queue_cast(r, lkb, -EAGAIN);
+
+ out:
+       return error;
+}
+
+static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       remove_lock(r, lkb);
+       queue_cast(r, lkb, -DLM_EUNLOCK);
+       grant_pending_locks(r);
+       return -DLM_EUNLOCK;
+}
+
+static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       revert_lock(r, lkb);
+       queue_cast(r, lkb, -DLM_ECANCEL);
+       grant_pending_locks(r);
+       return -DLM_ECANCEL;
+}
+
+/*
+ * Four stage 3 varieties:
+ * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
+ */
+
+/* add a new lkb to a possibly new rsb, called by requesting process */
+
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       int error;
+
+       /* set_master: sets lkb nodeid from r */
+
+       error = set_master(r, lkb);
+       if (error < 0)
+               goto out;
+       if (error) {
+               error = 0;
+               goto out;
+       }
+
+       if (is_remote(r))
+               /* receive_request() calls do_request() on remote node */
+               error = send_request(r, lkb);
+       else
+               error = do_request(r, lkb);
+ out:
+       return error;
+}
+
+/* change some property of an existing lkb, e.g. mode, range */
+
+static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       int error;
+
+       if (is_remote(r))
+               /* receive_convert() calls do_convert() on remote node */
+               error = send_convert(r, lkb);
+       else
+               error = do_convert(r, lkb);
+
+       return error;
+}
+
+/* remove an existing lkb from the granted queue */
+
+static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       int error;
+
+       if (is_remote(r))
+               /* receive_unlock() calls do_unlock() on remote node */
+               error = send_unlock(r, lkb);
+       else
+               error = do_unlock(r, lkb);
+
+       return error;
+}
+
+/* remove an existing lkb from the convert or wait queue */
+
+static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       int error;
+
+       if (is_remote(r))
+               /* receive_cancel() calls do_cancel() on remote node */
+               error = send_cancel(r, lkb);
+       else
+               error = do_cancel(r, lkb);
+
+       return error;
+}
+
+/*
+ * Four stage 2 varieties:
+ * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
+ */
+
+static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
+                       int len, struct dlm_args *args)
+{
+       struct dlm_rsb *r;
+       int error;
+
+       error = validate_lock_args(ls, lkb, args);
+       if (error)
+               goto out;
+
+       error = find_rsb(ls, name, len, R_CREATE, &r);
+       if (error)
+               goto out;
+
+       lock_rsb(r);
+
+       attach_lkb(r, lkb);
+       lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
+
+       error = _request_lock(r, lkb);
+
+       unlock_rsb(r);
+       put_rsb(r);
+
+ out:
+       return error;
+}
+
+static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                       struct dlm_args *args)
+{
+       struct dlm_rsb *r;
+       int error;
+
+       r = lkb->lkb_resource;
+
+       hold_rsb(r);
+       lock_rsb(r);
+
+       error = validate_lock_args(ls, lkb, args);
+       if (error)
+               goto out;
+
+       error = _convert_lock(r, lkb);
+ out:
+       unlock_rsb(r);
+       put_rsb(r);
+       return error;
+}
+
+static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                      struct dlm_args *args)
+{
+       struct dlm_rsb *r;
+       int error;
+
+       r = lkb->lkb_resource;
+
+       hold_rsb(r);
+       lock_rsb(r);
+
+       error = validate_unlock_args(lkb, args);
+       if (error)
+               goto out;
+
+       error = _unlock_lock(r, lkb);
+ out:
+       unlock_rsb(r);
+       put_rsb(r);
+       return error;
+}
+
+static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                      struct dlm_args *args)
+{
+       struct dlm_rsb *r;
+       int error;
+
+       r = lkb->lkb_resource;
+
+       hold_rsb(r);
+       lock_rsb(r);
+
+       error = validate_unlock_args(lkb, args);
+       if (error)
+               goto out;
+
+       error = _cancel_lock(r, lkb);
+ out:
+       unlock_rsb(r);
+       put_rsb(r);
+       return error;
+}
+
+/*
+ * Two stage 1 varieties:  dlm_lock() and dlm_unlock()
+ */
+
+int dlm_lock(dlm_lockspace_t *lockspace,
+            int mode,
+            struct dlm_lksb *lksb,
+            uint32_t flags,
+            void *name,
+            unsigned int namelen,
+            uint32_t parent_lkid,
+            void (*ast) (void *astarg),
+            void *astarg,
+            void (*bast) (void *astarg, int mode),
+            struct dlm_range *range)
+{
+       struct dlm_ls *ls;
+       struct dlm_lkb *lkb;
+       struct dlm_args args;
+       int error, convert = flags & DLM_LKF_CONVERT;
+
+       ls = dlm_find_lockspace_local(lockspace);
+       if (!ls)
+               return -EINVAL;
+
+       lock_recovery(ls);
+
+       if (convert)
+               error = find_lkb(ls, lksb->sb_lkid, &lkb);
+       else
+               error = create_lkb(ls, &lkb);
+
+       if (error)
+               goto out;
+
+       error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
+                             astarg, bast, range, &args);
+       if (error)
+               goto out_put;
+
+       if (convert)
+               error = convert_lock(ls, lkb, &args);
+       else
+               error = request_lock(ls, lkb, name, namelen, &args);
+
+       if (error == -EINPROGRESS)
+               error = 0;
+ out_put:
+       if (convert || error)
+               put_lkb(lkb);
+       if (error == -EAGAIN)
+               error = 0;
+ out:
+       unlock_recovery(ls);
+       dlm_put_lockspace(ls);
+       return error;
+}
+
+int dlm_unlock(dlm_lockspace_t *lockspace,
+              uint32_t lkid,
+              uint32_t flags,
+              struct dlm_lksb *lksb,
+              void *astarg)
+{
+       struct dlm_ls *ls;
+       struct dlm_lkb *lkb;
+       struct dlm_args args;
+       int error;
+
+       ls = dlm_find_lockspace_local(lockspace);
+       if (!ls)
+               return -EINVAL;
+
+       lock_recovery(ls);
+
+       error = find_lkb(ls, lkid, &lkb);
+       if (error)
+               goto out;
+
+       error = set_unlock_args(flags, astarg, &args);
+       if (error)
+               goto out_put;
+
+       if (flags & DLM_LKF_CANCEL)
+               error = cancel_lock(ls, lkb, &args);
+       else
+               error = unlock_lock(ls, lkb, &args);
+
+       if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
+               error = 0;
+ out_put:
+       put_lkb(lkb);
+ out:
+       unlock_recovery(ls);
+       dlm_put_lockspace(ls);
+       return error;
+}
+
+/*
+ * send/receive routines for remote operations and replies
+ *
+ * send_args
+ * send_common
+ * send_request                        receive_request
+ * send_convert                        receive_convert
+ * send_unlock                 receive_unlock
+ * send_cancel                 receive_cancel
+ * send_grant                  receive_grant
+ * send_bast                   receive_bast
+ * send_lookup                 receive_lookup
+ * send_remove                 receive_remove
+ *
+ *                             send_common_reply
+ * receive_request_reply       send_request_reply
+ * receive_convert_reply       send_convert_reply
+ * receive_unlock_reply                send_unlock_reply
+ * receive_cancel_reply                send_cancel_reply
+ * receive_lookup_reply                send_lookup_reply
+ */
+
+static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                         int to_nodeid, int mstype,
+                         struct dlm_message **ms_ret,
+                         struct dlm_mhandle **mh_ret)
+{
+       struct dlm_message *ms;
+       struct dlm_mhandle *mh;
+       char *mb;
+       int mb_len = sizeof(struct dlm_message);
+
+       switch (mstype) {
+       case DLM_MSG_REQUEST:
+       case DLM_MSG_LOOKUP:
+       case DLM_MSG_REMOVE:
+               mb_len += r->res_length;
+               break;
+       case DLM_MSG_CONVERT:
+       case DLM_MSG_UNLOCK:
+       case DLM_MSG_REQUEST_REPLY:
+       case DLM_MSG_CONVERT_REPLY:
+       case DLM_MSG_GRANT:
+               if (lkb && lkb->lkb_lvbptr)
+                       mb_len += r->res_ls->ls_lvblen;
+               break;
+       }
+
+       /* get_buffer gives us a message handle (mh) that we need to
+          pass into lowcomms_commit and a message buffer (mb) that we
+          write our data into */
+
+       mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
+       if (!mh)
+               return -ENOBUFS;
+
+       memset(mb, 0, mb_len);
+
+       ms = (struct dlm_message *) mb;
+
+       ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+       ms->m_header.h_lockspace = r->res_ls->ls_global_id;
+       ms->m_header.h_nodeid = dlm_our_nodeid();
+       ms->m_header.h_length = mb_len;
+       ms->m_header.h_cmd = DLM_MSG;
+
+       ms->m_type = mstype;
+
+       *mh_ret = mh;
+       *ms_ret = ms;
+       return 0;
+}
+
+/* further lowcomms enhancements or alternate implementations may make
+   the return value from this function useful at some point */
+
+static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
+{
+       dlm_message_out(ms);
+       dlm_lowcomms_commit_buffer(mh);
+       return 0;
+}
+
+static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                     struct dlm_message *ms)
+{
+       ms->m_nodeid   = lkb->lkb_nodeid;
+       ms->m_pid      = lkb->lkb_ownpid;
+       ms->m_lkid     = lkb->lkb_id;
+       ms->m_remid    = lkb->lkb_remid;
+       ms->m_exflags  = lkb->lkb_exflags;
+       ms->m_sbflags  = lkb->lkb_sbflags;
+       ms->m_flags    = lkb->lkb_flags;
+       ms->m_lvbseq   = lkb->lkb_lvbseq;
+       ms->m_status   = lkb->lkb_status;
+       ms->m_grmode   = lkb->lkb_grmode;
+       ms->m_rqmode   = lkb->lkb_rqmode;
+       ms->m_hash     = r->res_hash;
+
+       /* m_result and m_bastmode are set from function args,
+          not from lkb fields */
+
+       if (lkb->lkb_bastaddr)
+               ms->m_asts |= AST_BAST;
+       if (lkb->lkb_astaddr)
+               ms->m_asts |= AST_COMP;
+
+       if (lkb->lkb_range) {
+               ms->m_range[0] = lkb->lkb_range[RQ_RANGE_START];
+               ms->m_range[1] = lkb->lkb_range[RQ_RANGE_END];
+       }
+
+       if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
+               memcpy(ms->m_extra, r->res_name, r->res_length);
+
+       else if (lkb->lkb_lvbptr)
+               memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+
+}
+
+static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
+{
+       struct dlm_message *ms;
+       struct dlm_mhandle *mh;
+       int to_nodeid, error;
+
+       add_to_waiters(lkb, mstype);
+
+       to_nodeid = r->res_nodeid;
+
+       error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
+       if (error)
+               goto fail;
+
+       send_args(r, lkb, ms);
+
+       error = send_message(mh, ms);
+       if (error)
+               goto fail;
+       return 0;
+
+ fail:
+       remove_from_waiters(lkb);
+       return error;
+}
+
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       return send_common(r, lkb, DLM_MSG_REQUEST);
+}
+
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       int error;
+
+       error = send_common(r, lkb, DLM_MSG_CONVERT);
+
+       /* down conversions go without a reply from the master */
+       if (!error && down_conversion(lkb)) {
+               remove_from_waiters(lkb);
+               r->res_ls->ls_stub_ms.m_result = 0;
+               __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
+       }
+
+       return error;
+}
+
+/* FIXME: if this lkb is the only lock we hold on the rsb, then set
+   MASTER_UNCERTAIN to force the next request on the rsb to confirm
+   that the master is still correct. */
+
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       return send_common(r, lkb, DLM_MSG_UNLOCK);
+}
+
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       return send_common(r, lkb, DLM_MSG_CANCEL);
+}
+
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       struct dlm_message *ms;
+       struct dlm_mhandle *mh;
+       int to_nodeid, error;
+
+       to_nodeid = lkb->lkb_nodeid;
+
+       error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
+       if (error)
+               goto out;
+
+       send_args(r, lkb, ms);
+
+       ms->m_result = 0;
+
+       error = send_message(mh, ms);
+ out:
+       return error;
+}
+
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
+{
+       struct dlm_message *ms;
+       struct dlm_mhandle *mh;
+       int to_nodeid, error;
+
+       to_nodeid = lkb->lkb_nodeid;
+
+       error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
+       if (error)
+               goto out;
+
+       send_args(r, lkb, ms);
+
+       ms->m_bastmode = mode;
+
+       error = send_message(mh, ms);
+ out:
+       return error;
+}
+
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       struct dlm_message *ms;
+       struct dlm_mhandle *mh;
+       int to_nodeid, error;
+
+       add_to_waiters(lkb, DLM_MSG_LOOKUP);
+
+       to_nodeid = dlm_dir_nodeid(r);
+
+       error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
+       if (error)
+               goto fail;
+
+       send_args(r, lkb, ms);
+
+       error = send_message(mh, ms);
+       if (error)
+               goto fail;
+       return 0;
+
+ fail:
+       remove_from_waiters(lkb);
+       return error;
+}
+
+static int send_remove(struct dlm_rsb *r)
+{
+       struct dlm_message *ms;
+       struct dlm_mhandle *mh;
+       int to_nodeid, error;
+
+       to_nodeid = dlm_dir_nodeid(r);
+
+       error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
+       if (error)
+               goto out;
+
+       memcpy(ms->m_extra, r->res_name, r->res_length);
+       ms->m_hash = r->res_hash;
+
+       error = send_message(mh, ms);
+ out:
+       return error;
+}
+
+static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                            int mstype, int rv)
+{
+       struct dlm_message *ms;
+       struct dlm_mhandle *mh;
+       int to_nodeid, error;
+
+       to_nodeid = lkb->lkb_nodeid;
+
+       error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
+       if (error)
+               goto out;
+
+       send_args(r, lkb, ms);
+
+       ms->m_result = rv;
+
+       error = send_message(mh, ms);
+ out:
+       return error;
+}
+
+static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+       return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
+}
+
+static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+       return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
+}
+
+static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+       return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
+}
+
+static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+       return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
+}
+
+static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
+                            int ret_nodeid, int rv)
+{
+       struct dlm_rsb *r = &ls->ls_stub_rsb;
+       struct dlm_message *ms;
+       struct dlm_mhandle *mh;
+       int error, nodeid = ms_in->m_header.h_nodeid;
+
+       error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
+       if (error)
+               goto out;
+
+       ms->m_lkid = ms_in->m_lkid;
+       ms->m_result = rv;
+       ms->m_nodeid = ret_nodeid;
+
+       error = send_message(mh, ms);
+ out:
+       return error;
+}
+
+/* which args we save from a received message depends heavily on the type
+   of message, unlike the send side where we can safely send everything about
+   the lkb for any type of message */
+
+static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+       lkb->lkb_exflags = ms->m_exflags;
+       lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
+                        (ms->m_flags & 0x0000FFFF);
+}
+
+static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+       lkb->lkb_sbflags = ms->m_sbflags;
+       lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
+                        (ms->m_flags & 0x0000FFFF);
+}
+
+static int receive_extralen(struct dlm_message *ms)
+{
+       return (ms->m_header.h_length - sizeof(struct dlm_message));
+}
+
+static int receive_range(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                        struct dlm_message *ms)
+{
+       if (lkb->lkb_flags & DLM_IFL_RANGE) {
+               if (!lkb->lkb_range)
+                       lkb->lkb_range = allocate_range(ls);
+               if (!lkb->lkb_range)
+                       return -ENOMEM;
+               lkb->lkb_range[RQ_RANGE_START] = ms->m_range[0];
+               lkb->lkb_range[RQ_RANGE_END] = ms->m_range[1];
+       }
+       return 0;
+}
+
+static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                      struct dlm_message *ms)
+{
+       int len;
+
+       if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+               if (!lkb->lkb_lvbptr)
+                       lkb->lkb_lvbptr = allocate_lvb(ls);
+               if (!lkb->lkb_lvbptr)
+                       return -ENOMEM;
+               len = receive_extralen(ms);
+               memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
+       }
+       return 0;
+}
+
+static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                               struct dlm_message *ms)
+{
+       lkb->lkb_nodeid = ms->m_header.h_nodeid;
+       lkb->lkb_ownpid = ms->m_pid;
+       lkb->lkb_remid = ms->m_lkid;
+       lkb->lkb_grmode = DLM_LOCK_IV;
+       lkb->lkb_rqmode = ms->m_rqmode;
+       lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
+       lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
+
+       DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
+
+       if (receive_range(ls, lkb, ms))
+               return -ENOMEM;
+
+       if (receive_lvb(ls, lkb, ms))
+               return -ENOMEM;
+
+       return 0;
+}
+
+static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                               struct dlm_message *ms)
+{
+       if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
+               log_error(ls, "convert_args nodeid %d %d lkid %x %x",
+                         lkb->lkb_nodeid, ms->m_header.h_nodeid,
+                         lkb->lkb_id, lkb->lkb_remid);
+               return -EINVAL;
+       }
+
+       if (!is_master_copy(lkb))
+               return -EINVAL;
+
+       if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+               return -EBUSY;
+
+       if (receive_range(ls, lkb, ms))
+               return -ENOMEM;
+       if (lkb->lkb_range) {
+               lkb->lkb_range[GR_RANGE_START] = 0LL;
+               lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
+       }
+
+       if (receive_lvb(ls, lkb, ms))
+               return -ENOMEM;
+
+       lkb->lkb_rqmode = ms->m_rqmode;
+       lkb->lkb_lvbseq = ms->m_lvbseq;
+
+       return 0;
+}
+
+static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                              struct dlm_message *ms)
+{
+       if (!is_master_copy(lkb))
+               return -EINVAL;
+       if (receive_lvb(ls, lkb, ms))
+               return -ENOMEM;
+       return 0;
+}
+
+/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
+   uses to send a reply and that the remote end uses to process the reply. */
+
+static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
+{
+       struct dlm_lkb *lkb = &ls->ls_stub_lkb;
+       lkb->lkb_nodeid = ms->m_header.h_nodeid;
+       lkb->lkb_remid = ms->m_lkid;
+}
+
+static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
+{
+       struct dlm_lkb *lkb;
+       struct dlm_rsb *r;
+       int error, namelen;
+
+       error = create_lkb(ls, &lkb);
+       if (error)
+               goto fail;
+
+       receive_flags(lkb, ms);
+       lkb->lkb_flags |= DLM_IFL_MSTCPY;
+       error = receive_request_args(ls, lkb, ms);
+       if (error) {
+               put_lkb(lkb);
+               goto fail;
+       }
+
+       namelen = receive_extralen(ms);
+
+       error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
+       if (error) {
+               put_lkb(lkb);
+               goto fail;
+       }
+
+       lock_rsb(r);
+
+       attach_lkb(r, lkb);
+       error = do_request(r, lkb);
+       send_request_reply(r, lkb, error);
+
+       unlock_rsb(r);
+       put_rsb(r);
+
+       if (error == -EINPROGRESS)
+               error = 0;
+       if (error)
+               put_lkb(lkb);
+       return;
+
+ fail:
+       setup_stub_lkb(ls, ms);
+       send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
+{
+       struct dlm_lkb *lkb;
+       struct dlm_rsb *r;
+       int error, reply = TRUE;
+
+       error = find_lkb(ls, ms->m_remid, &lkb);
+       if (error)
+               goto fail;
+
+       r = lkb->lkb_resource;
+
+       hold_rsb(r);
+       lock_rsb(r);
+
+       receive_flags(lkb, ms);
+       error = receive_convert_args(ls, lkb, ms);
+       if (error)
+               goto out;
+       reply = !down_conversion(lkb);
+
+       error = do_convert(r, lkb);
+ out:
+       if (reply)
+               send_convert_reply(r, lkb, error);
+
+       unlock_rsb(r);
+       put_rsb(r);
+       put_lkb(lkb);
+       return;
+
+ fail:
+       setup_stub_lkb(ls, ms);
+       send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
+{
+       struct dlm_lkb *lkb;
+       struct dlm_rsb *r;
+       int error;
+
+       error = find_lkb(ls, ms->m_remid, &lkb);
+       if (error)
+               goto fail;
+
+       r = lkb->lkb_resource;
+
+       hold_rsb(r);
+       lock_rsb(r);
+
+       receive_flags(lkb, ms);
+       error = receive_unlock_args(ls, lkb, ms);
+       if (error)
+               goto out;
+
+       error = do_unlock(r, lkb);
+ out:
+       send_unlock_reply(r, lkb, error);
+
+       unlock_rsb(r);
+       put_rsb(r);
+       put_lkb(lkb);
+       return;
+
+ fail:
+       setup_stub_lkb(ls, ms);
+       send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
+{
+       struct dlm_lkb *lkb;
+       struct dlm_rsb *r;
+       int error;
+
+       error = find_lkb(ls, ms->m_remid, &lkb);
+       if (error)
+               goto fail;
+
+       receive_flags(lkb, ms);
+
+       r = lkb->lkb_resource;
+
+       hold_rsb(r);
+       lock_rsb(r);
+
+       error = do_cancel(r, lkb);
+       send_cancel_reply(r, lkb, error);
+
+       unlock_rsb(r);
+       put_rsb(r);
+       put_lkb(lkb);
+       return;
+
+ fail:
+       setup_stub_lkb(ls, ms);
+       send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
+{
+       struct dlm_lkb *lkb;
+       struct dlm_rsb *r;
+       int error;
+
+       error = find_lkb(ls, ms->m_remid, &lkb);
+       if (error) {
+               log_error(ls, "receive_grant no lkb");
+               return;
+       }
+       DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+       r = lkb->lkb_resource;
+
+       hold_rsb(r);
+       lock_rsb(r);
+
+       receive_flags_reply(lkb, ms);
+       grant_lock_pc(r, lkb, ms);
+       queue_cast(r, lkb, 0);
+
+       unlock_rsb(r);
+       put_rsb(r);
+       put_lkb(lkb);
+}
+
+static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
+{
+       struct dlm_lkb *lkb;
+       struct dlm_rsb *r;
+       int error;
+
+       error = find_lkb(ls, ms->m_remid, &lkb);
+       if (error) {
+               log_error(ls, "receive_bast no lkb");
+               return;
+       }
+       DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+       r = lkb->lkb_resource;
+
+       hold_rsb(r);
+       lock_rsb(r);
+
+       queue_bast(r, lkb, ms->m_bastmode);
+
+       unlock_rsb(r);
+       put_rsb(r);
+       put_lkb(lkb);
+}
+
+static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
+{
+       int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
+
+       from_nodeid = ms->m_header.h_nodeid;
+       our_nodeid = dlm_our_nodeid();
+
+       len = receive_extralen(ms);
+
+       dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
+       if (dir_nodeid != our_nodeid) {
+               log_error(ls, "lookup dir_nodeid %d from %d",
+                         dir_nodeid, from_nodeid);
+               error = -EINVAL;
+               ret_nodeid = -1;
+               goto out;
+       }
+
+       error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
+
+       /* Optimization: we're master so treat lookup as a request */
+       if (!error && ret_nodeid == our_nodeid) {
+               receive_request(ls, ms);
+               return;
+       }
+ out:
+       send_lookup_reply(ls, ms, ret_nodeid, error);
+}
+
+static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
+{
+       int len, dir_nodeid, from_nodeid;
+
+       from_nodeid = ms->m_header.h_nodeid;
+
+       len = receive_extralen(ms);
+
+       dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
+       if (dir_nodeid != dlm_our_nodeid()) {
+               log_error(ls, "remove dir entry dir_nodeid %d from %d",
+                         dir_nodeid, from_nodeid);
+               return;
+       }
+
+       dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
+}
+
+static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+       struct dlm_lkb *lkb;
+       struct dlm_rsb *r;
+       int error, mstype;
+
+       error = find_lkb(ls, ms->m_remid, &lkb);
+       if (error) {
+               log_error(ls, "receive_request_reply no lkb");
+               return;
+       }
+       DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+       mstype = lkb->lkb_wait_type;
+       error = remove_from_waiters(lkb);
+       if (error) {
+               log_error(ls, "receive_request_reply not on waiters");
+               goto out;
+       }
+
+       /* this is the value returned from do_request() on the master */
+       error = ms->m_result;
+
+       r = lkb->lkb_resource;
+       hold_rsb(r);
+       lock_rsb(r);
+
+       /* Optimization: the dir node was also the master, so it took our
+          lookup as a request and sent request reply instead of lookup reply */
+       if (mstype == DLM_MSG_LOOKUP) {
+               r->res_nodeid = ms->m_header.h_nodeid;
+               lkb->lkb_nodeid = r->res_nodeid;
+       }
+
+       switch (error) {
+       case -EAGAIN:
+               /* request would block (be queued) on remote master;
+                  the unhold undoes the original ref from create_lkb()
+                  so it leads to the lkb being freed */
+               queue_cast(r, lkb, -EAGAIN);
+               confirm_master(r, -EAGAIN);
+               unhold_lkb(lkb);
+               break;
+
+       case -EINPROGRESS:
+       case 0:
+               /* request was queued or granted on remote master */
+               receive_flags_reply(lkb, ms);
+               lkb->lkb_remid = ms->m_lkid;
+               if (error)
+                       add_lkb(r, lkb, DLM_LKSTS_WAITING);
+               else {
+                       grant_lock_pc(r, lkb, ms);
+                       queue_cast(r, lkb, 0);
+               }
+               confirm_master(r, error);
+               break;
+
+       case -ENOENT:
+       case -ENOTBLK:
+               /* find_rsb failed to find rsb or rsb wasn't master */
+               r->res_nodeid = -1;
+               lkb->lkb_nodeid = -1;
+               _request_lock(r, lkb);
+               break;
+
+       default:
+               log_error(ls, "receive_request_reply error %d", error);
+       }
+
+       unlock_rsb(r);
+       put_rsb(r);
+ out:
+       put_lkb(lkb);
+}
+
+static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                                   struct dlm_message *ms)
+{
+       int error = ms->m_result;
+
+       /* this is the value returned from do_convert() on the master */
+
+       switch (error) {
+       case -EAGAIN:
+               /* convert would block (be queued) on remote master */
+               queue_cast(r, lkb, -EAGAIN);
+               break;
+
+       case -EINPROGRESS:
+               /* convert was queued on remote master */
+               del_lkb(r, lkb);
+               add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+               break;
+
+       case 0:
+               /* convert was granted on remote master */
+               receive_flags_reply(lkb, ms);
+               grant_lock_pc(r, lkb, ms);
+               queue_cast(r, lkb, 0);
+               break;
+
+       default:
+               log_error(r->res_ls, "receive_convert_reply error %d", error);
+       }
+}
+
+static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+       struct dlm_rsb *r = lkb->lkb_resource;
+
+       hold_rsb(r);
+       lock_rsb(r);
+
+       __receive_convert_reply(r, lkb, ms);
+
+       unlock_rsb(r);
+       put_rsb(r);
+}
+
+static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+       struct dlm_lkb *lkb;
+       int error;
+
+       error = find_lkb(ls, ms->m_remid, &lkb);
+       if (error) {
+               log_error(ls, "receive_convert_reply no lkb");
+               return;
+       }
+       DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+       error = remove_from_waiters(lkb);
+       if (error) {
+               log_error(ls, "receive_convert_reply not on waiters");
+               goto out;
+       }
+
+       _receive_convert_reply(lkb, ms);
+ out:
+       put_lkb(lkb);
+}
+
+static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+       struct dlm_rsb *r = lkb->lkb_resource;
+       int error = ms->m_result;
+
+       hold_rsb(r);
+       lock_rsb(r);
+
+       /* this is the value returned from do_unlock() on the master */
+
+       switch (error) {
+       case -DLM_EUNLOCK:
+               receive_flags_reply(lkb, ms);
+               remove_lock_pc(r, lkb);
+               queue_cast(r, lkb, -DLM_EUNLOCK);
+               break;
+       default:
+               log_error(r->res_ls, "receive_unlock_reply error %d", error);
+       }
+
+       unlock_rsb(r);
+       put_rsb(r);
+}
+
+static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+       struct dlm_lkb *lkb;
+       int error;
+
+       error = find_lkb(ls, ms->m_remid, &lkb);
+       if (error) {
+               log_error(ls, "receive_unlock_reply no lkb");
+               return;
+       }
+       DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+       error = remove_from_waiters(lkb);
+       if (error) {
+               log_error(ls, "receive_unlock_reply not on waiters");
+               goto out;
+       }
+
+       _receive_unlock_reply(lkb, ms);
+ out:
+       put_lkb(lkb);
+}
+
+static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+       struct dlm_rsb *r = lkb->lkb_resource;
+       int error = ms->m_result;
+
+       hold_rsb(r);
+       lock_rsb(r);
+
+       /* this is the value returned from do_cancel() on the master */
+
+       switch (error) {
+       case -DLM_ECANCEL:
+               receive_flags_reply(lkb, ms);
+               revert_lock_pc(r, lkb);
+               queue_cast(r, lkb, -DLM_ECANCEL);
+               break;
+       default:
+               log_error(r->res_ls, "receive_cancel_reply error %d", error);
+       }
+
+       unlock_rsb(r);
+       put_rsb(r);
+}
+
+static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+       struct dlm_lkb *lkb;
+       int error;
+
+       error = find_lkb(ls, ms->m_remid, &lkb);
+       if (error) {
+               log_error(ls, "receive_cancel_reply no lkb");
+               return;
+       }
+       DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+       error = remove_from_waiters(lkb);
+       if (error) {
+               log_error(ls, "receive_cancel_reply not on waiters");
+               goto out;
+       }
+
+       _receive_cancel_reply(lkb, ms);
+ out:
+       put_lkb(lkb);
+}
+
+static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+       struct dlm_lkb *lkb;
+       struct dlm_rsb *r;
+       int error, ret_nodeid;
+
+       error = find_lkb(ls, ms->m_lkid, &lkb);
+       if (error) {
+               log_error(ls, "receive_lookup_reply no lkb");
+               return;
+       }
+
+       error = remove_from_waiters(lkb);
+       if (error) {
+               log_error(ls, "receive_lookup_reply not on waiters");
+               goto out;
+       }
+
+       /* this is the value returned by dlm_dir_lookup on dir node
+          FIXME: will a non-zero error ever be returned? */
+       error = ms->m_result;
+
+       r = lkb->lkb_resource;
+       hold_rsb(r);
+       lock_rsb(r);
+
+       ret_nodeid = ms->m_nodeid;
+       if (ret_nodeid == dlm_our_nodeid()) {
+               r->res_nodeid = 0;
+               ret_nodeid = 0;
+               r->res_first_lkid = 0;
+       } else {
+               /* set_master() will copy res_nodeid to lkb_nodeid */
+               r->res_nodeid = ret_nodeid;
+       }
+
+       _request_lock(r, lkb);
+
+       if (!ret_nodeid)
+               process_lookup_list(r);
+
+       unlock_rsb(r);
+       put_rsb(r);
+ out:
+       put_lkb(lkb);
+}
+
+int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
+{
+       struct dlm_message *ms = (struct dlm_message *) hd;
+       struct dlm_ls *ls;
+       int error;
+
+       if (!recovery)
+               dlm_message_in(ms);
+
+       ls = dlm_find_lockspace_global(hd->h_lockspace);
+       if (!ls) {
+               log_print("drop message %d from %d for unknown lockspace %d",
+                         ms->m_type, nodeid, hd->h_lockspace);
+               return -EINVAL;
+       }
+
+       /* recovery may have just ended leaving a bunch of backed-up requests
+          in the requestqueue; wait while dlm_recoverd clears them */
+
+       if (!recovery)
+               dlm_wait_requestqueue(ls);
+
+       /* recovery may have just started while there were a bunch of
+          in-flight requests -- save them in requestqueue to be processed
+          after recovery.  we can't let dlm_recvd block on the recovery
+          lock.  if dlm_recoverd is calling this function to clear the
+          requestqueue, it needs to be interrupted (-EINTR) if another
+          recovery operation is starting. */
+
+       while (1) {
+               if (dlm_locking_stopped(ls)) {
+                       if (!recovery)
+                               dlm_add_requestqueue(ls, nodeid, hd);
+                       error = -EINTR;
+                       goto out;
+               }
+
+               if (lock_recovery_try(ls))
+                       break;
+               schedule();
+       }
+
+       switch (ms->m_type) {
+
+       /* messages sent to a master node */
+
+       case DLM_MSG_REQUEST:
+               receive_request(ls, ms);
+               break;
+
+       case DLM_MSG_CONVERT:
+               receive_convert(ls, ms);
+               break;
+
+       case DLM_MSG_UNLOCK:
+               receive_unlock(ls, ms);
+               break;
+
+       case DLM_MSG_CANCEL:
+               receive_cancel(ls, ms);
+               break;
+
+       /* messages sent from a master node (replies to above) */
+
+       case DLM_MSG_REQUEST_REPLY:
+               receive_request_reply(ls, ms);
+               break;
+
+       case DLM_MSG_CONVERT_REPLY:
+               receive_convert_reply(ls, ms);
+               break;
+
+       case DLM_MSG_UNLOCK_REPLY:
+               receive_unlock_reply(ls, ms);
+               break;
+
+       case DLM_MSG_CANCEL_REPLY:
+               receive_cancel_reply(ls, ms);
+               break;
+
+       /* messages sent from a master node (only two types of async msg) */
+
+       case DLM_MSG_GRANT:
+               receive_grant(ls, ms);
+               break;
+
+       case DLM_MSG_BAST:
+               receive_bast(ls, ms);
+               break;
+
+       /* messages sent to a dir node */
+
+       case DLM_MSG_LOOKUP:
+               receive_lookup(ls, ms);
+               break;
+
+       case DLM_MSG_REMOVE:
+               receive_remove(ls, ms);
+               break;
+
+       /* messages sent from a dir node (remove has no reply) */
+
+       case DLM_MSG_LOOKUP_REPLY:
+               receive_lookup_reply(ls, ms);
+               break;
+
+       default:
+               log_error(ls, "unknown message type %d", ms->m_type);
+       }
+
+       unlock_recovery(ls);
+ out:
+       dlm_put_lockspace(ls);
+       dlm_astd_wake();
+       return 0;
+}
+
+
+/*
+ * Recovery related
+ */
+
+static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+       if (middle_conversion(lkb)) {
+               hold_lkb(lkb);
+               ls->ls_stub_ms.m_result = -EINPROGRESS;
+               _remove_from_waiters(lkb);
+               _receive_convert_reply(lkb, &ls->ls_stub_ms);
+
+               /* Same special case as in receive_rcom_lock_args() */
+               lkb->lkb_grmode = DLM_LOCK_IV;
+               rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
+               unhold_lkb(lkb);
+
+       } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
+               lkb->lkb_flags |= DLM_IFL_RESEND;
+       }
+
+       /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
+          conversions are async; there's no reply from the remote master */
+}
+
+/* A waiting lkb needs recovery if the master node has failed, or
+   the master node is changing (only when no directory is used) */
+
+static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+       if (dlm_is_removed(ls, lkb->lkb_nodeid))
+               return 1;
+
+       if (!dlm_no_directory(ls))
+               return 0;
+
+       if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
+               return 1;
+
+       return 0;
+}
+
+/* Recovery for locks that are waiting for replies from nodes that are now
+   gone.  We can just complete unlocks and cancels by faking a reply from the
+   dead node.  Requests and up-conversions we flag to be resent after
+   recovery.  Down-conversions can just be completed with a fake reply like
+   unlocks.  Conversions between PR and CW need special attention. */
+
+void dlm_recover_waiters_pre(struct dlm_ls *ls)
+{
+       struct dlm_lkb *lkb, *safe;
+
+       down(&ls->ls_waiters_sem);
+
+       list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
+               log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
+                         lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
+
+               /* all outstanding lookups, regardless of destination  will be
+                  resent after recovery is done */
+
+               if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
+                       lkb->lkb_flags |= DLM_IFL_RESEND;
+                       continue;
+               }
+
+               if (!waiter_needs_recovery(ls, lkb))
+                       continue;
+
+               switch (lkb->lkb_wait_type) {
+
+               case DLM_MSG_REQUEST:
+                       lkb->lkb_flags |= DLM_IFL_RESEND;
+                       break;
+
+               case DLM_MSG_CONVERT:
+                       recover_convert_waiter(ls, lkb);
+                       break;
+
+               case DLM_MSG_UNLOCK:
+                       hold_lkb(lkb);
+                       ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
+                       _remove_from_waiters(lkb);
+                       _receive_unlock_reply(lkb, &ls->ls_stub_ms);
+                       put_lkb(lkb);
+                       break;
+
+               case DLM_MSG_CANCEL:
+                       hold_lkb(lkb);
+                       ls->ls_stub_ms.m_result = -DLM_ECANCEL;
+                       _remove_from_waiters(lkb);
+                       _receive_cancel_reply(lkb, &ls->ls_stub_ms);
+                       put_lkb(lkb);
+                       break;
+
+               default:
+                       log_error(ls, "invalid lkb wait_type %d",
+                                 lkb->lkb_wait_type);
+               }
+       }
+       up(&ls->ls_waiters_sem);
+}
+
+static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+       struct dlm_lkb *lkb;
+       int rv = 0;
+
+       down(&ls->ls_waiters_sem);
+       list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+               if (lkb->lkb_flags & DLM_IFL_RESEND) {
+                       rv = lkb->lkb_wait_type;
+                       _remove_from_waiters(lkb);
+                       lkb->lkb_flags &= ~DLM_IFL_RESEND;
+                       break;
+               }
+       }
+       up(&ls->ls_waiters_sem);
+
+       if (!rv)
+               lkb = NULL;
+       *lkb_ret = lkb;
+       return rv;
+}
+
+/* Deal with lookups and lkb's marked RESEND from _pre.  We may now be the
+   master or dir-node for r.  Processing the lkb may result in it being placed
+   back on waiters. */
+
+int dlm_recover_waiters_post(struct dlm_ls *ls)
+{
+       struct dlm_lkb *lkb;
+       struct dlm_rsb *r;
+       int error = 0, mstype;
+
+       while (1) {
+               if (dlm_locking_stopped(ls)) {
+                       log_debug(ls, "recover_waiters_post aborted");
+                       error = -EINTR;
+                       break;
+               }
+
+               mstype = remove_resend_waiter(ls, &lkb);
+               if (!mstype)
+                       break;
+
+               r = lkb->lkb_resource;
+
+               log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
+                         lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
+
+               switch (mstype) {
+
+               case DLM_MSG_LOOKUP:
+                       hold_rsb(r);
+                       lock_rsb(r);
+                       _request_lock(r, lkb);
+                       if (is_master(r))
+                               confirm_master(r, 0);
+                       unlock_rsb(r);
+                       put_rsb(r);
+                       break;
+
+               case DLM_MSG_REQUEST:
+                       hold_rsb(r);
+                       lock_rsb(r);
+                       _request_lock(r, lkb);
+                       unlock_rsb(r);
+                       put_rsb(r);
+                       break;
+
+               case DLM_MSG_CONVERT:
+                       hold_rsb(r);
+                       lock_rsb(r);
+                       _convert_lock(r, lkb);
+                       unlock_rsb(r);
+                       put_rsb(r);
+                       break;
+
+               default:
+                       log_error(ls, "recover_waiters_post type %d", mstype);
+               }
+       }
+
+       return error;
+}
+
+static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
+                       int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
+{
+       struct dlm_ls *ls = r->res_ls;
+       struct dlm_lkb *lkb, *safe;
+
+       list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
+               if (test(ls, lkb)) {
+                       del_lkb(r, lkb);
+                       /* this put should free the lkb */
+                       if (!put_lkb(lkb))
+                               log_error(ls, "purged lkb not released");
+               }
+       }
+}
+
+static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+       return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
+}
+
+static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+       return is_master_copy(lkb);
+}
+
+static void purge_dead_locks(struct dlm_rsb *r)
+{
+       purge_queue(r, &r->res_grantqueue, &purge_dead_test);
+       purge_queue(r, &r->res_convertqueue, &purge_dead_test);
+       purge_queue(r, &r->res_waitqueue, &purge_dead_test);
+}
+
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
+{
+       purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
+       purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
+       purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
+}
+
+/* Get rid of locks held by nodes that are gone. */
+
+int dlm_purge_locks(struct dlm_ls *ls)
+{
+       struct dlm_rsb *r;
+
+       log_debug(ls, "dlm_purge_locks");
+
+       down_write(&ls->ls_root_sem);
+       list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+               hold_rsb(r);
+               lock_rsb(r);
+               if (is_master(r))
+                       purge_dead_locks(r);
+               unlock_rsb(r);
+               unhold_rsb(r);
+
+               schedule();
+       }
+       up_write(&ls->ls_root_sem);
+
+       return 0;
+}
+
+int dlm_grant_after_purge(struct dlm_ls *ls)
+{
+       struct dlm_rsb *r;
+       int i;
+
+       for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+               read_lock(&ls->ls_rsbtbl[i].lock);
+               list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
+                       hold_rsb(r);
+                       lock_rsb(r);
+                       if (is_master(r)) {
+                               grant_pending_locks(r);
+                               confirm_master(r, 0);
+                       }
+                       unlock_rsb(r);
+                       put_rsb(r);
+               }
+               read_unlock(&ls->ls_rsbtbl[i].lock);
+       }
+
+       return 0;
+}
+
+static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
+                                        uint32_t remid)
+{
+       struct dlm_lkb *lkb;
+
+       list_for_each_entry(lkb, head, lkb_statequeue) {
+               if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
+                       return lkb;
+       }
+       return NULL;
+}
+
+static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
+                                   uint32_t remid)
+{
+       struct dlm_lkb *lkb;
+
+       lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
+       if (lkb)
+               return lkb;
+       lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
+       if (lkb)
+               return lkb;
+       lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
+       if (lkb)
+               return lkb;
+       return NULL;
+}
+
+static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                                 struct dlm_rsb *r, struct dlm_rcom *rc)
+{
+       struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+       int lvblen;
+
+       lkb->lkb_nodeid = rc->rc_header.h_nodeid;
+       lkb->lkb_ownpid = rl->rl_ownpid;
+       lkb->lkb_remid = rl->rl_lkid;
+       lkb->lkb_exflags = rl->rl_exflags;
+       lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
+       lkb->lkb_flags |= DLM_IFL_MSTCPY;
+       lkb->lkb_lvbseq = rl->rl_lvbseq;
+       lkb->lkb_rqmode = rl->rl_rqmode;
+       lkb->lkb_grmode = rl->rl_grmode;
+       /* don't set lkb_status because add_lkb wants to itself */
+
+       lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
+       lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
+
+       if (lkb->lkb_flags & DLM_IFL_RANGE) {
+               lkb->lkb_range = allocate_range(ls);
+               if (!lkb->lkb_range)
+                       return -ENOMEM;
+               memcpy(lkb->lkb_range, rl->rl_range, 4*sizeof(uint64_t));
+       }
+
+       if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+               lkb->lkb_lvbptr = allocate_lvb(ls);
+               if (!lkb->lkb_lvbptr)
+                       return -ENOMEM;
+               lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
+                        sizeof(struct rcom_lock);
+               memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
+       }
+
+       /* Conversions between PR and CW (middle modes) need special handling.
+          The real granted mode of these converting locks cannot be determined
+          until all locks have been rebuilt on the rsb (recover_conversion) */
+
+       if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
+               rl->rl_status = DLM_LKSTS_CONVERT;
+               lkb->lkb_grmode = DLM_LOCK_IV;
+               rsb_set_flag(r, RSB_RECOVER_CONVERT);
+       }
+
+       return 0;
+}
+
+/* This lkb may have been recovered in a previous aborted recovery so we need
+   to check if the rsb already has an lkb with the given remote nodeid/lkid.
+   If so we just send back a standard reply.  If not, we create a new lkb with
+   the given values and send back our lkid.  We send back our lkid by sending
+   back the rcom_lock struct we got but with the remid field filled in. */
+
+int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+       struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+       struct dlm_rsb *r;
+       struct dlm_lkb *lkb;
+       int error;
+
+       if (rl->rl_parent_lkid) {
+               error = -EOPNOTSUPP;
+               goto out;
+       }
+
+       error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
+       if (error)
+               goto out;
+
+       lock_rsb(r);
+
+       lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
+       if (lkb) {
+               error = -EEXIST;
+               goto out_remid;
+       }
+
+       error = create_lkb(ls, &lkb);
+       if (error)
+               goto out_unlock;
+
+       error = receive_rcom_lock_args(ls, lkb, r, rc);
+       if (error) {
+               put_lkb(lkb);
+               goto out_unlock;
+       }
+
+       attach_lkb(r, lkb);
+       add_lkb(r, lkb, rl->rl_status);
+       error = 0;
+
+ out_remid:
+       /* this is the new value returned to the lock holder for
+          saving in its process-copy lkb */
+       rl->rl_remid = lkb->lkb_id;
+
+ out_unlock:
+       unlock_rsb(r);
+       put_rsb(r);
+ out:
+       if (error)
+               log_print("recover_master_copy %d %x", error, rl->rl_lkid);
+       rl->rl_result = error;
+       return error;
+}
+
+int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+       struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+       struct dlm_rsb *r;
+       struct dlm_lkb *lkb;
+       int error;
+
+       error = find_lkb(ls, rl->rl_lkid, &lkb);
+       if (error) {
+               log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
+               return error;
+       }
+
+       DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+       error = rl->rl_result;
+
+       r = lkb->lkb_resource;
+       hold_rsb(r);
+       lock_rsb(r);
+
+       switch (error) {
+       case -EEXIST:
+               log_debug(ls, "master copy exists %x", lkb->lkb_id);
+               /* fall through */
+       case 0:
+               lkb->lkb_remid = rl->rl_remid;
+               break;
+       default:
+               log_error(ls, "dlm_recover_process_copy unknown error %d %x",
+                         error, lkb->lkb_id);
+       }
+
+       /* an ack for dlm_recover_locks() which waits for replies from
+          all the locks it sends to new masters */
+       dlm_recovered_lock(r);
+
+       unlock_rsb(r);
+       put_rsb(r);
+       put_lkb(lkb);
+
+       return 0;
+}
+
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644 (file)
index 0000000..9e6499f
--- /dev/null
@@ -0,0 +1,50 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOCK_DOT_H__
+#define __LOCK_DOT_H__
+
+void dlm_print_rsb(struct dlm_rsb *r);
+int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
+int dlm_modes_compat(int mode1, int mode2);
+int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
+       unsigned int flags, struct dlm_rsb **r_ret);
+void dlm_put_rsb(struct dlm_rsb *r);
+void dlm_hold_rsb(struct dlm_rsb *r);
+int dlm_put_lkb(struct dlm_lkb *lkb);
+void dlm_scan_rsbs(struct dlm_ls *ls);
+
+int dlm_purge_locks(struct dlm_ls *ls);
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
+int dlm_grant_after_purge(struct dlm_ls *ls);
+int dlm_recover_waiters_post(struct dlm_ls *ls);
+void dlm_recover_waiters_pre(struct dlm_ls *ls);
+int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+
+static inline int is_master(struct dlm_rsb *r)
+{
+       return !r->res_nodeid;
+}
+
+static inline void lock_rsb(struct dlm_rsb *r)
+{
+       down(&r->res_sem);
+}
+
+static inline void unlock_rsb(struct dlm_rsb *r)
+{
+       up(&r->res_sem);
+}
+
+#endif
+
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644 (file)
index 0000000..fee4659
--- /dev/null
@@ -0,0 +1,666 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "recoverd.h"
+#include "ast.h"
+#include "dir.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+
+#ifdef CONFIG_DLM_DEBUG
+int dlm_create_debug_file(struct dlm_ls *ls);
+void dlm_delete_debug_file(struct dlm_ls *ls);
+#else
+static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
+static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
+#endif
+
+static int                     ls_count;
+static struct semaphore                ls_lock;
+static struct list_head                lslist;
+static spinlock_t              lslist_lock;
+static struct task_struct *    scand_task;
+
+
+static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+       ssize_t ret = len;
+       int n = simple_strtol(buf, NULL, 0);
+
+       switch (n) {
+       case 0:
+               dlm_ls_stop(ls);
+               break;
+       case 1:
+               dlm_ls_start(ls);
+               break;
+       default:
+               ret = -EINVAL;
+       }
+       return ret;
+}
+
+static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+       ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
+       set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
+       wake_up(&ls->ls_uevent_wait);
+       return len;
+}
+
+static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
+{
+       return sprintf(buf, "%u\n", ls->ls_global_id);
+}
+
+static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+       ls->ls_global_id = simple_strtoul(buf, NULL, 0);
+       return len;
+}
+
+struct dlm_attr {
+       struct attribute attr;
+       ssize_t (*show)(struct dlm_ls *, char *);
+       ssize_t (*store)(struct dlm_ls *, const char *, size_t);
+};
+
+static struct dlm_attr dlm_attr_control = {
+       .attr  = {.name = "control", .mode = S_IWUSR},
+       .store = dlm_control_store
+};
+
+static struct dlm_attr dlm_attr_event = {
+       .attr  = {.name = "event_done", .mode = S_IWUSR},
+       .store = dlm_event_store
+};
+
+static struct dlm_attr dlm_attr_id = {
+       .attr  = {.name = "id", .mode = S_IRUGO | S_IWUSR},
+       .show  = dlm_id_show,
+       .store = dlm_id_store
+};
+
+static struct attribute *dlm_attrs[] = {
+       &dlm_attr_control.attr,
+       &dlm_attr_event.attr,
+       &dlm_attr_id.attr,
+       NULL,
+};
+
+static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
+                            char *buf)
+{
+       struct dlm_ls *ls  = container_of(kobj, struct dlm_ls, ls_kobj);
+       struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
+       return a->show ? a->show(ls, buf) : 0;
+}
+
+static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
+                             const char *buf, size_t len)
+{
+       struct dlm_ls *ls  = container_of(kobj, struct dlm_ls, ls_kobj);
+       struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
+       return a->store ? a->store(ls, buf, len) : len;
+}
+
+static struct sysfs_ops dlm_attr_ops = {
+       .show  = dlm_attr_show,
+       .store = dlm_attr_store,
+};
+
+static struct kobj_type dlm_ktype = {
+       .default_attrs = dlm_attrs,
+       .sysfs_ops     = &dlm_attr_ops,
+};
+
+static struct kset dlm_kset = {
+       .subsys = &kernel_subsys,
+       .kobj   = {.name = "dlm",},
+       .ktype  = &dlm_ktype,
+};
+
+static int kobject_setup(struct dlm_ls *ls)
+{
+       char lsname[DLM_LOCKSPACE_LEN];
+       int error;
+
+       memset(lsname, 0, DLM_LOCKSPACE_LEN);
+       snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
+
+       error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
+       if (error)
+               return error;
+
+       ls->ls_kobj.kset = &dlm_kset;
+       ls->ls_kobj.ktype = &dlm_ktype;
+       return 0;
+}
+
+static int do_uevent(struct dlm_ls *ls, int in)
+{
+       int error;
+
+       if (in)
+               kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
+       else
+               kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
+
+       error = wait_event_interruptible(ls->ls_uevent_wait,
+                       test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
+       if (error)
+               goto out;
+
+       error = ls->ls_uevent_result;
+ out:
+       return error;
+}
+
+
+int dlm_lockspace_init(void)
+{
+       int error;
+
+       ls_count = 0;
+       init_MUTEX(&ls_lock);
+       INIT_LIST_HEAD(&lslist);
+       spin_lock_init(&lslist_lock);
+
+       error = kset_register(&dlm_kset);
+       if (error)
+               printk("dlm_lockspace_init: cannot register kset %d\n", error);
+       return error;
+}
+
+void dlm_lockspace_exit(void)
+{
+       kset_unregister(&dlm_kset);
+}
+
+static int dlm_scand(void *data)
+{
+       struct dlm_ls *ls;
+
+       while (!kthread_should_stop()) {
+               list_for_each_entry(ls, &lslist, ls_list)
+                       dlm_scan_rsbs(ls);
+               schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
+       }
+       return 0;
+}
+
+static int dlm_scand_start(void)
+{
+       struct task_struct *p;
+       int error = 0;
+
+       p = kthread_run(dlm_scand, NULL, "dlm_scand");
+       if (IS_ERR(p))
+               error = PTR_ERR(p);
+       else
+               scand_task = p;
+       return error;
+}
+
+static void dlm_scand_stop(void)
+{
+       kthread_stop(scand_task);
+}
+
+static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
+{
+       struct dlm_ls *ls;
+
+       spin_lock(&lslist_lock);
+
+       list_for_each_entry(ls, &lslist, ls_list) {
+               if (ls->ls_namelen == namelen &&
+                   memcmp(ls->ls_name, name, namelen) == 0)
+                       goto out;
+       }
+       ls = NULL;
+ out:
+       spin_unlock(&lslist_lock);
+       return ls;
+}
+
+struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
+{
+       struct dlm_ls *ls;
+
+       spin_lock(&lslist_lock);
+
+       list_for_each_entry(ls, &lslist, ls_list) {
+               if (ls->ls_global_id == id) {
+                       ls->ls_count++;
+                       goto out;
+               }
+       }
+       ls = NULL;
+ out:
+       spin_unlock(&lslist_lock);
+       return ls;
+}
+
+struct dlm_ls *dlm_find_lockspace_local(void *id)
+{
+       struct dlm_ls *ls = id;
+
+       spin_lock(&lslist_lock);
+       ls->ls_count++;
+       spin_unlock(&lslist_lock);
+       return ls;
+}
+
+void dlm_put_lockspace(struct dlm_ls *ls)
+{
+       spin_lock(&lslist_lock);
+       ls->ls_count--;
+       spin_unlock(&lslist_lock);
+}
+
+static void remove_lockspace(struct dlm_ls *ls)
+{
+       for (;;) {
+               spin_lock(&lslist_lock);
+               if (ls->ls_count == 0) {
+                       list_del(&ls->ls_list);
+                       spin_unlock(&lslist_lock);
+                       return;
+               }
+               spin_unlock(&lslist_lock);
+               ssleep(1);
+       }
+}
+
+static int threads_start(void)
+{
+       int error;
+
+       /* Thread which process lock requests for all lockspace's */
+       error = dlm_astd_start();
+       if (error) {
+               log_print("cannot start dlm_astd thread %d", error);
+               goto fail;
+       }
+
+       error = dlm_scand_start();
+       if (error) {
+               log_print("cannot start dlm_scand thread %d", error);
+               goto astd_fail;
+       }
+
+       /* Thread for sending/receiving messages for all lockspace's */
+       error = dlm_lowcomms_start();
+       if (error) {
+               log_print("cannot start dlm lowcomms %d", error);
+               goto scand_fail;
+       }
+
+       return 0;
+
+ scand_fail:
+       dlm_scand_stop();
+ astd_fail:
+       dlm_astd_stop();
+ fail:
+       return error;
+}
+
+static void threads_stop(void)
+{
+       dlm_scand_stop();
+       dlm_lowcomms_stop();
+       dlm_astd_stop();
+}
+
+static int new_lockspace(char *name, int namelen, void **lockspace,
+                        uint32_t flags, int lvblen)
+{
+       struct dlm_ls *ls;
+       int i, size, error = -ENOMEM;
+
+       if (namelen > DLM_LOCKSPACE_LEN)
+               return -EINVAL;
+
+       if (!lvblen || (lvblen % 8))
+               return -EINVAL;
+
+       if (!try_module_get(THIS_MODULE))
+               return -EINVAL;
+
+       ls = dlm_find_lockspace_name(name, namelen);
+       if (ls) {
+               *lockspace = ls;
+               module_put(THIS_MODULE);
+               return -EEXIST;
+       }
+
+       ls = kmalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
+       if (!ls)
+               goto out;
+       memset(ls, 0, sizeof(struct dlm_ls) + namelen);
+       memcpy(ls->ls_name, name, namelen);
+       ls->ls_namelen = namelen;
+       ls->ls_exflags = flags;
+       ls->ls_lvblen = lvblen;
+       ls->ls_count = 0;
+       ls->ls_flags = 0;
+
+       size = dlm_config.rsbtbl_size;
+       ls->ls_rsbtbl_size = size;
+
+       ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
+       if (!ls->ls_rsbtbl)
+               goto out_lsfree;
+       for (i = 0; i < size; i++) {
+               INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
+               INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
+               rwlock_init(&ls->ls_rsbtbl[i].lock);
+       }
+
+       size = dlm_config.lkbtbl_size;
+       ls->ls_lkbtbl_size = size;
+
+       ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
+       if (!ls->ls_lkbtbl)
+               goto out_rsbfree;
+       for (i = 0; i < size; i++) {
+               INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
+               rwlock_init(&ls->ls_lkbtbl[i].lock);
+               ls->ls_lkbtbl[i].counter = 1;
+       }
+
+       size = dlm_config.dirtbl_size;
+       ls->ls_dirtbl_size = size;
+
+       ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
+       if (!ls->ls_dirtbl)
+               goto out_lkbfree;
+       for (i = 0; i < size; i++) {
+               INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
+               rwlock_init(&ls->ls_dirtbl[i].lock);
+       }
+
+       INIT_LIST_HEAD(&ls->ls_waiters);
+       init_MUTEX(&ls->ls_waiters_sem);
+
+       INIT_LIST_HEAD(&ls->ls_nodes);
+       INIT_LIST_HEAD(&ls->ls_nodes_gone);
+       ls->ls_num_nodes = 0;
+       ls->ls_low_nodeid = 0;
+       ls->ls_total_weight = 0;
+       ls->ls_node_array = NULL;
+
+       memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
+       ls->ls_stub_rsb.res_ls = ls;
+
+       ls->ls_debug_dentry = NULL;
+
+       init_waitqueue_head(&ls->ls_uevent_wait);
+       ls->ls_uevent_result = 0;
+
+       ls->ls_recoverd_task = NULL;
+       init_MUTEX(&ls->ls_recoverd_active);
+       spin_lock_init(&ls->ls_recover_lock);
+       ls->ls_recover_status = 0;
+       ls->ls_recover_seq = 0;
+       ls->ls_recover_args = NULL;
+       init_rwsem(&ls->ls_in_recovery);
+       INIT_LIST_HEAD(&ls->ls_requestqueue);
+       init_MUTEX(&ls->ls_requestqueue_lock);
+
+       ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
+       if (!ls->ls_recover_buf)
+               goto out_dirfree;
+
+       INIT_LIST_HEAD(&ls->ls_recover_list);
+       spin_lock_init(&ls->ls_recover_list_lock);
+       ls->ls_recover_list_count = 0;
+       init_waitqueue_head(&ls->ls_wait_general);
+       INIT_LIST_HEAD(&ls->ls_root_list);
+       init_rwsem(&ls->ls_root_sem);
+
+       down_write(&ls->ls_in_recovery);
+
+       error = dlm_recoverd_start(ls);
+       if (error) {
+               log_error(ls, "can't start dlm_recoverd %d", error);
+               goto out_rcomfree;
+       }
+
+       spin_lock(&lslist_lock);
+       list_add(&ls->ls_list, &lslist);
+       spin_unlock(&lslist_lock);
+
+       dlm_create_debug_file(ls);
+
+       error = kobject_setup(ls);
+       if (error)
+               goto out_del;
+
+       error = kobject_register(&ls->ls_kobj);
+       if (error)
+               goto out_del;
+
+       error = do_uevent(ls, 1);
+       if (error)
+               goto out_unreg;
+
+       *lockspace = ls;
+       return 0;
+
+ out_unreg:
+       kobject_unregister(&ls->ls_kobj);
+ out_del:
+       dlm_delete_debug_file(ls);
+       spin_lock(&lslist_lock);
+       list_del(&ls->ls_list);
+       spin_unlock(&lslist_lock);
+       dlm_recoverd_stop(ls);
+ out_rcomfree:
+       kfree(ls->ls_recover_buf);
+ out_dirfree:
+       kfree(ls->ls_dirtbl);
+ out_lkbfree:
+       kfree(ls->ls_lkbtbl);
+ out_rsbfree:
+       kfree(ls->ls_rsbtbl);
+ out_lsfree:
+       kfree(ls);
+ out:
+       module_put(THIS_MODULE);
+       return error;
+}
+
+int dlm_new_lockspace(char *name, int namelen, void **lockspace,
+                     uint32_t flags, int lvblen)
+{
+       int error = 0;
+
+       down(&ls_lock);
+       if (!ls_count)
+               error = threads_start();
+       if (error)
+               goto out;
+
+       error = new_lockspace(name, namelen, lockspace, flags, lvblen);
+       if (!error)
+               ls_count++;
+ out:
+       up(&ls_lock);
+       return error;
+}
+
+/* Return 1 if the lockspace still has active remote locks,
+ *        2 if the lockspace still has active local locks.
+ */
+static int lockspace_busy(struct dlm_ls *ls)
+{
+       int i, lkb_found = 0;
+       struct dlm_lkb *lkb;
+
+       /* NOTE: We check the lockidtbl here rather than the resource table.
+          This is because there may be LKBs queued as ASTs that have been
+          unlinked from their RSBs and are pending deletion once the AST has
+          been delivered */
+
+       for (i = 0; i < ls->ls_lkbtbl_size; i++) {
+               read_lock(&ls->ls_lkbtbl[i].lock);
+               if (!list_empty(&ls->ls_lkbtbl[i].list)) {
+                       lkb_found = 1;
+                       list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
+                                           lkb_idtbl_list) {
+                               if (!lkb->lkb_nodeid) {
+                                       read_unlock(&ls->ls_lkbtbl[i].lock);
+                                       return 2;
+                               }
+                       }
+               }
+               read_unlock(&ls->ls_lkbtbl[i].lock);
+       }
+       return lkb_found;
+}
+
+static int release_lockspace(struct dlm_ls *ls, int force)
+{
+       struct dlm_lkb *lkb;
+       struct dlm_rsb *rsb;
+       struct list_head *head;
+       int i;
+       int busy = lockspace_busy(ls);
+
+       if (busy > force)
+               return -EBUSY;
+
+       if (force < 3)
+               do_uevent(ls, 0);
+
+       dlm_recoverd_stop(ls);
+
+       remove_lockspace(ls);
+
+       dlm_delete_debug_file(ls);
+
+       dlm_astd_suspend();
+
+       kfree(ls->ls_recover_buf);
+
+       /*
+        * Free direntry structs.
+        */
+
+       dlm_dir_clear(ls);
+       kfree(ls->ls_dirtbl);
+
+       /*
+        * Free all lkb's on lkbtbl[] lists.
+        */
+
+       for (i = 0; i < ls->ls_lkbtbl_size; i++) {
+               head = &ls->ls_lkbtbl[i].list;
+               while (!list_empty(head)) {
+                       lkb = list_entry(head->next, struct dlm_lkb,
+                                        lkb_idtbl_list);
+
+                       list_del(&lkb->lkb_idtbl_list);
+
+                       dlm_del_ast(lkb);
+
+                       if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
+                               free_lvb(lkb->lkb_lvbptr);
+
+                       free_lkb(lkb);
+               }
+       }
+       dlm_astd_resume();
+
+       kfree(ls->ls_lkbtbl);
+
+       /*
+        * Free all rsb's on rsbtbl[] lists
+        */
+
+       for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+               head = &ls->ls_rsbtbl[i].list;
+               while (!list_empty(head)) {
+                       rsb = list_entry(head->next, struct dlm_rsb,
+                                        res_hashchain);
+
+                       list_del(&rsb->res_hashchain);
+                       free_rsb(rsb);
+               }
+
+               head = &ls->ls_rsbtbl[i].toss;
+               while (!list_empty(head)) {
+                       rsb = list_entry(head->next, struct dlm_rsb,
+                                        res_hashchain);
+                       list_del(&rsb->res_hashchain);
+                       free_rsb(rsb);
+               }
+       }
+
+       kfree(ls->ls_rsbtbl);
+
+       /*
+        * Free structures on any other lists
+        */
+
+       kfree(ls->ls_recover_args);
+       dlm_clear_free_entries(ls);
+       dlm_clear_members(ls);
+       dlm_clear_members_gone(ls);
+       kfree(ls->ls_node_array);
+       kobject_unregister(&ls->ls_kobj);
+       kfree(ls);
+
+       down(&ls_lock);
+       ls_count--;
+       if (!ls_count)
+               threads_stop();
+       up(&ls_lock);
+
+       module_put(THIS_MODULE);
+       return 0;
+}
+
+/*
+ * Called when a system has released all its locks and is not going to use the
+ * lockspace any longer.  We free everything we're managing for this lockspace.
+ * Remaining nodes will go through the recovery process as if we'd died.  The
+ * lockspace must continue to function as usual, participating in recoveries,
+ * until this returns.
+ *
+ * Force has 4 possible values:
+ * 0 - don't destroy locksapce if it has any LKBs
+ * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
+ * 2 - destroy lockspace regardless of LKBs
+ * 3 - destroy lockspace as part of a forced shutdown
+ */
+
+int dlm_release_lockspace(void *lockspace, int force)
+{
+       struct dlm_ls *ls;
+
+       ls = dlm_find_lockspace_local(lockspace);
+       if (!ls)
+               return -EINVAL;
+       dlm_put_lockspace(ls);
+       return release_lockspace(ls, force);
+}
+
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644 (file)
index 0000000..17bd3ba
--- /dev/null
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOCKSPACE_DOT_H__
+#define __LOCKSPACE_DOT_H__
+
+int dlm_lockspace_init(void);
+void dlm_lockspace_exit(void);
+struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
+struct dlm_ls *dlm_find_lockspace_local(void *id);
+void dlm_put_lockspace(struct dlm_ls *ls);
+
+#endif                         /* __LOCKSPACE_DOT_H__ */
+
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644 (file)
index 0000000..09b0124
--- /dev/null
@@ -0,0 +1,1218 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * lowcomms.c
+ *
+ * This is the "low-level" comms layer.
+ *
+ * It is responsible for sending/receiving messages
+ * from other nodes in the cluster.
+ *
+ * Cluster nodes are referred to by their nodeids. nodeids are
+ * simply 32 bit numbers to the locking module - if they need to
+ * be expanded for the cluster infrastructure then that is it's
+ * responsibility. It is this layer's
+ * responsibility to resolve these into IP address or
+ * whatever it needs for inter-node communication.
+ *
+ * The comms level is two kernel threads that deal mainly with
+ * the receiving of messages from other nodes and passing them
+ * up to the mid-level comms layer (which understands the
+ * message format) for execution by the locking core, and
+ * a send thread which does all the setting up of connections
+ * to remote nodes and the sending of data. Threads are not allowed
+ * to send their own data because it may cause them to wait in times
+ * of high load. Also, this way, the sending thread can collect together
+ * messages bound for one node and send them in one block.
+ *
+ * I don't see any problem with the recv thread executing the locking
+ * code on behalf of remote processes as the locking code is
+ * short, efficient and never (well, hardly ever) waits.
+ *
+ */
+
+#include <asm/ioctls.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/sctp/user.h>
+#include <linux/pagemap.h>
+#include <linux/socket.h>
+#include <linux/idr.h>
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "midcomms.h"
+
+static struct sockaddr_storage *local_addr[DLM_MAX_ADDR_COUNT];
+static int                     local_count;
+static int                     local_nodeid;
+
+/* One of these per connected node */
+
+#define NI_INIT_PENDING 1
+#define NI_WRITE_PENDING 2
+
+struct nodeinfo {
+       spinlock_t              lock;
+       sctp_assoc_t            assoc_id;
+       unsigned long           flags;
+       struct list_head        write_list; /* nodes with pending writes */
+       struct list_head        writequeue; /* outgoing writequeue_entries */
+       spinlock_t              writequeue_lock;
+       int                     nodeid;
+};
+
+static DEFINE_IDR(nodeinfo_idr);
+static struct rw_semaphore     nodeinfo_lock;
+static int                     max_nodeid;
+
+struct cbuf {
+       unsigned                base;
+       unsigned                len;
+       unsigned                mask;
+};
+
+/* Just the one of these, now. But this struct keeps
+   the connection-specific variables together */
+
+#define CF_READ_PENDING 1
+
+struct connection {
+       struct socket          *sock;
+       unsigned long           flags;
+       struct page            *rx_page;
+       atomic_t                waiting_requests;
+       struct cbuf             cb;
+       int                     eagain_flag;
+};
+
+/* An entry waiting to be sent */
+
+struct writequeue_entry {
+       struct list_head        list;
+       struct page            *page;
+       int                     offset;
+       int                     len;
+       int                     end;
+       int                     users;
+       struct nodeinfo        *ni;
+};
+
+#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
+#define CBUF_EMPTY(cb) ((cb)->len == 0)
+#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
+#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
+
+#define CBUF_INIT(cb, size) \
+do { \
+       (cb)->base = (cb)->len = 0; \
+       (cb)->mask = ((size)-1); \
+} while(0)
+
+#define CBUF_EAT(cb, n) \
+do { \
+       (cb)->len  -= (n); \
+       (cb)->base += (n); \
+       (cb)->base &= (cb)->mask; \
+} while(0)
+
+
+/* List of nodes which have writes pending */
+static struct list_head write_nodes;
+static spinlock_t write_nodes_lock;
+
+/* Maximum number of incoming messages to process before
+ * doing a schedule()
+ */
+#define MAX_RX_MSG_COUNT 25
+
+/* Manage daemons */
+static struct task_struct *recv_task;
+static struct task_struct *send_task;
+static wait_queue_head_t lowcomms_recv_wait;
+static atomic_t accepting;
+
+/* The SCTP connection */
+static struct connection sctp_con;
+
+
+static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
+{
+       struct sockaddr_storage addr;
+       int error;
+
+       if (!local_count)
+               return -1;
+
+       error = dlm_nodeid_to_addr(nodeid, &addr);
+       if (error)
+               return error;
+
+       if (local_addr[0]->ss_family == AF_INET) {
+               struct sockaddr_in *in4  = (struct sockaddr_in *) &addr;
+               struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
+               ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
+       } else {
+               struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &addr;
+               struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
+               memcpy(&ret6->sin6_addr, &in6->sin6_addr,
+                      sizeof(in6->sin6_addr));
+       }
+
+       return 0;
+}
+
+static struct nodeinfo *nodeid2nodeinfo(int nodeid, int alloc)
+{
+       struct nodeinfo *ni;
+       int r;
+       int n;
+
+       down_read(&nodeinfo_lock);
+       ni = idr_find(&nodeinfo_idr, nodeid);
+       up_read(&nodeinfo_lock);
+
+       if (!ni && alloc) {
+               down_write(&nodeinfo_lock);
+
+               ni = idr_find(&nodeinfo_idr, nodeid);
+               if (ni)
+                       goto out_up;
+
+               r = idr_pre_get(&nodeinfo_idr, alloc);
+               if (!r)
+                       goto out_up;
+
+               ni = kmalloc(sizeof(struct nodeinfo), alloc);
+               if (!ni)
+                       goto out_up;
+
+               r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
+               if (r) {
+                       kfree(ni);
+                       ni = NULL;
+                       goto out_up;
+               }
+               if (n != nodeid) {
+                       idr_remove(&nodeinfo_idr, n);
+                       kfree(ni);
+                       ni = NULL;
+                       goto out_up;
+               }
+               memset(ni, 0, sizeof(struct nodeinfo));
+               spin_lock_init(&ni->lock);
+               INIT_LIST_HEAD(&ni->writequeue);
+               spin_lock_init(&ni->writequeue_lock);
+               ni->nodeid = nodeid;
+
+               if (nodeid > max_nodeid)
+                       max_nodeid = nodeid;
+       out_up:
+               up_write(&nodeinfo_lock);
+       }
+
+       return ni;
+}
+
+/* Don't call this too often... */
+static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
+{
+       int i;
+       struct nodeinfo *ni;
+
+       for (i=1; i<=max_nodeid; i++) {
+               ni = nodeid2nodeinfo(i, 0);
+               if (ni && ni->assoc_id == assoc)
+                       return ni;
+       }
+       return NULL;
+}
+
+/* Data or notification available on socket */
+static void lowcomms_data_ready(struct sock *sk, int count_unused)
+{
+       atomic_inc(&sctp_con.waiting_requests);
+       if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
+               return;
+
+       wake_up_interruptible(&lowcomms_recv_wait);
+}
+
+
+/* Add the port number to an IP6 or 4 sockaddr and return the address length.
+   Also padd out the struct with zeros to make comparisons meaningful */
+
+static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
+                         int *addr_len)
+{
+       struct sockaddr_in *local4_addr;
+       struct sockaddr_in6 *local6_addr;
+
+       if (!local_count)
+               return;
+
+       if (!port) {
+               if (local_addr[0]->ss_family == AF_INET) {
+                       local4_addr = (struct sockaddr_in *)local_addr[0];
+                       port = be16_to_cpu(local4_addr->sin_port);
+               } else {
+                       local6_addr = (struct sockaddr_in6 *)local_addr[0];
+                       port = be16_to_cpu(local6_addr->sin6_port);
+               }
+       }
+
+       saddr->ss_family = local_addr[0]->ss_family;
+       if (local_addr[0]->ss_family == AF_INET) {
+               struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
+               in4_addr->sin_port = cpu_to_be16(port);
+               memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
+               memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
+                                     sizeof(struct sockaddr_in));
+               *addr_len = sizeof(struct sockaddr_in);
+       } else {
+               struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
+               in6_addr->sin6_port = cpu_to_be16(port);
+               memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
+                                     sizeof(struct sockaddr_in6));
+               *addr_len = sizeof(struct sockaddr_in6);
+       }
+}
+
+/* Close the connection and tidy up */
+static void close_connection(void)
+{
+       if (sctp_con.sock) {
+               sock_release(sctp_con.sock);
+               sctp_con.sock = NULL;
+       }
+
+       if (sctp_con.rx_page) {
+               __free_page(sctp_con.rx_page);
+               sctp_con.rx_page = NULL;
+       }
+}
+
+/* We only send shutdown messages to nodes that are not part of the cluster */
+static void send_shutdown(sctp_assoc_t associd)
+{
+       static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+       struct msghdr outmessage;
+       struct cmsghdr *cmsg;
+       struct sctp_sndrcvinfo *sinfo;
+       int ret;
+
+       outmessage.msg_name = NULL;
+       outmessage.msg_namelen = 0;
+       outmessage.msg_control = outcmsg;
+       outmessage.msg_controllen = sizeof(outcmsg);
+       outmessage.msg_flags = MSG_EOR;
+
+       cmsg = CMSG_FIRSTHDR(&outmessage);
+       cmsg->cmsg_level = IPPROTO_SCTP;
+       cmsg->cmsg_type = SCTP_SNDRCV;
+       cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+       outmessage.msg_controllen = cmsg->cmsg_len;
+       sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+       memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+
+       sinfo->sinfo_flags |= MSG_EOF;
+       sinfo->sinfo_assoc_id = associd;
+
+       ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
+
+       if (ret != 0)
+               log_print("send EOF to node failed: %d", ret);
+}
+
+
+/* INIT failed but we don't know which node...
+   restart INIT on all pending nodes */
+static void init_failed(void)
+{
+       int i;
+       struct nodeinfo *ni;
+
+       for (i=1; i<=max_nodeid; i++) {
+               ni = nodeid2nodeinfo(i, 0);
+               if (!ni)
+                       continue;
+
+               if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
+                       ni->assoc_id = 0;
+                       if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+                               spin_lock_bh(&write_nodes_lock);
+                               list_add_tail(&ni->write_list, &write_nodes);
+                               spin_unlock_bh(&write_nodes_lock);
+                       }
+               }
+       }
+       wake_up_process(send_task);
+}
+
+/* Something happened to an association */
+static void process_sctp_notification(struct msghdr *msg, char *buf)
+{
+       union sctp_notification *sn = (union sctp_notification *)buf;
+
+       if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+               switch (sn->sn_assoc_change.sac_state) {
+
+               case SCTP_COMM_UP:
+               case SCTP_RESTART:
+               {
+                       /* Check that the new node is in the lockspace */
+                       struct sctp_prim prim;
+                       mm_segment_t fs;
+                       int nodeid;
+                       int prim_len, ret;
+                       int addr_len;
+                       struct nodeinfo *ni;
+
+                       /* This seems to happen when we received a connection
+                        * too early... or something...  anyway, it happens but
+                        * we always seem to get a real message too, see
+                        * receive_from_sock */
+
+                       if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
+                               log_print("COMM_UP for invalid assoc ID %d",
+                                        (int)sn->sn_assoc_change.sac_assoc_id);
+                               init_failed();
+                               return;
+                       }
+                       memset(&prim, 0, sizeof(struct sctp_prim));
+                       prim_len = sizeof(struct sctp_prim);
+                       prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
+
+                       fs = get_fs();
+                       set_fs(get_ds());
+                       ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
+                                               IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
+                                               (char*)&prim, &prim_len);
+                       set_fs(fs);
+                       if (ret < 0) {
+                               struct nodeinfo *ni;
+
+                               log_print("getsockopt/sctp_primary_addr on "
+                                         "new assoc %d failed : %d",
+                                   (int)sn->sn_assoc_change.sac_assoc_id, ret);
+
+                               /* Retry INIT later */
+                               ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
+                               if (ni)
+                                       clear_bit(NI_INIT_PENDING, &ni->flags);
+                               return;
+                       }
+                       make_sockaddr(&prim.ssp_addr, 0, &addr_len);
+                       if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
+                               log_print("reject connect from unknown addr");
+                               send_shutdown(prim.ssp_assoc_id);
+                               return;
+                       }
+
+                       ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
+                       if (!ni)
+                               return;
+
+                       /* Save the assoc ID */
+                       spin_lock(&ni->lock);
+                       ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
+                       spin_unlock(&ni->lock);
+
+                       log_print("got new/restarted association %d nodeid %d",
+                              (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+
+                       /* Send any pending writes */
+                       clear_bit(NI_INIT_PENDING, &ni->flags);
+                       if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+                               spin_lock_bh(&write_nodes_lock);
+                               list_add_tail(&ni->write_list, &write_nodes);
+                               spin_unlock_bh(&write_nodes_lock);
+                       }
+                       wake_up_process(send_task);
+               }
+               break;
+
+               case SCTP_COMM_LOST:
+               case SCTP_SHUTDOWN_COMP:
+               {
+                       struct nodeinfo *ni;
+
+                       ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
+                       if (ni) {
+                               spin_lock(&ni->lock);
+                               ni->assoc_id = 0;
+                               spin_unlock(&ni->lock);
+                       }
+               }
+               break;
+
+               /* We don't know which INIT failed, so clear the PENDING flags
+                * on them all.  if assoc_id is zero then it will then try
+                * again */
+
+               case SCTP_CANT_STR_ASSOC:
+               {
+                       log_print("Can't start SCTP association - retrying");
+                       init_failed();
+               }
+               break;
+
+               default:
+                       log_print("unexpected SCTP assoc change id=%d state=%d",
+                                 (int)sn->sn_assoc_change.sac_assoc_id,
+                                 sn->sn_assoc_change.sac_state);
+               }
+       }
+}
+
+/* Data received from remote end */
+static int receive_from_sock(void)
+{
+       int ret = 0;
+       struct msghdr msg;
+       struct kvec iov[2];
+       unsigned len;
+       int r;
+       struct sctp_sndrcvinfo *sinfo;
+       struct cmsghdr *cmsg;
+       struct nodeinfo *ni;
+
+       /* These two are marginally too big for stack allocation, but this
+        * function is (currently) only called by dlm_recvd so static should be
+        * OK.
+        */
+       static struct sockaddr_storage msgname;
+       static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+
+       if (sctp_con.sock == NULL)
+               goto out;
+
+       if (sctp_con.rx_page == NULL) {
+               /*
+                * This doesn't need to be atomic, but I think it should
+                * improve performance if it is.
+                */
+               sctp_con.rx_page = alloc_page(GFP_ATOMIC);
+               if (sctp_con.rx_page == NULL)
+                       goto out_resched;
+               CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
+       }
+
+       memset(&incmsg, 0, sizeof(incmsg));
+       memset(&msgname, 0, sizeof(msgname));
+
+       memset(incmsg, 0, sizeof(incmsg));
+       msg.msg_name = &msgname;
+       msg.msg_namelen = sizeof(msgname);
+       msg.msg_flags = 0;
+       msg.msg_control = incmsg;
+       msg.msg_controllen = sizeof(incmsg);
+
+       /* I don't see why this circular buffer stuff is necessary for SCTP
+        * which is a packet-based protocol, but the whole thing breaks under
+        * load without it! The overhead is minimal (and is in the TCP lowcomms
+        * anyway, of course) so I'll leave it in until I can figure out what's
+        * really happening.
+        */
+
+       /*
+        * iov[0] is the bit of the circular buffer between the current end
+        * point (cb.base + cb.len) and the end of the buffer.
+        */
+       iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
+       iov[0].iov_base = page_address(sctp_con.rx_page) +
+                         CBUF_DATA(&sctp_con.cb);
+       iov[1].iov_len = 0;
+
+       /*
+        * iov[1] is the bit of the circular buffer between the start of the
+        * buffer and the start of the currently used section (cb.base)
+        */
+       if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
+               iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
+               iov[1].iov_len = sctp_con.cb.base;
+               iov[1].iov_base = page_address(sctp_con.rx_page);
+               msg.msg_iovlen = 2;
+       }
+       len = iov[0].iov_len + iov[1].iov_len;
+
+       r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, 1, len,
+                                MSG_NOSIGNAL | MSG_DONTWAIT);
+       if (ret <= 0)
+               goto out_close;
+
+       msg.msg_control = incmsg;
+       msg.msg_controllen = sizeof(incmsg);
+       cmsg = CMSG_FIRSTHDR(&msg);
+       sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+
+       if (msg.msg_flags & MSG_NOTIFICATION) {
+               process_sctp_notification(&msg, page_address(sctp_con.rx_page));
+               return 0;
+       }
+
+       /* Is this a new association ? */
+       ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
+       if (ni) {
+               ni->assoc_id = sinfo->sinfo_assoc_id;
+               if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
+
+                       if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+                               spin_lock_bh(&write_nodes_lock);
+                               list_add_tail(&ni->write_list, &write_nodes);
+                               spin_unlock_bh(&write_nodes_lock);
+                       }
+                       wake_up_process(send_task);
+               }
+       }
+
+       /* INIT sends a message with length of 1 - ignore it */
+       if (r == 1)
+               return 0;
+
+       CBUF_ADD(&sctp_con.cb, ret);
+       ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
+                                         page_address(sctp_con.rx_page),
+                                         sctp_con.cb.base, sctp_con.cb.len,
+                                         PAGE_CACHE_SIZE);
+       if (ret < 0)
+               goto out_close;
+       CBUF_EAT(&sctp_con.cb, ret);
+
+      out:
+       ret = 0;
+       goto out_ret;
+
+      out_resched:
+       lowcomms_data_ready(sctp_con.sock->sk, 0);
+       ret = 0;
+       schedule();
+       goto out_ret;
+
+      out_close:
+       if (ret != -EAGAIN)
+               log_print("error reading from sctp socket: %d", ret);
+      out_ret:
+       return ret;
+}
+
+/* Bind to an IP address. SCTP allows multiple address so it can do multi-homing */
+static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
+{
+       mm_segment_t fs;
+       int result = 0;
+
+       fs = get_fs();
+       set_fs(get_ds());
+       if (num == 1)
+               result = sctp_con.sock->ops->bind(sctp_con.sock,
+                                       (struct sockaddr *) addr, addr_len);
+       else
+               result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
+                               SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
+       set_fs(fs);
+
+       if (result < 0)
+               log_print("Can't bind to port %d addr number %d",
+                         dlm_config.tcp_port, num);
+
+       return result;
+}
+
+static void init_local(void)
+{
+       struct sockaddr_storage sas, *addr;
+       int i;
+
+       local_nodeid = dlm_our_nodeid();
+
+       for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
+               if (dlm_our_addr(&sas, i))
+                       break;
+
+               addr = kmalloc(sizeof(*addr), GFP_KERNEL);
+               if (!addr)
+                       break;
+               memcpy(addr, &sas, sizeof(*addr));
+               local_addr[local_count++] = addr;
+       }
+}
+
+/* Initialise SCTP socket and bind to all interfaces */
+static int init_sock(void)
+{
+       mm_segment_t fs;
+       struct socket *sock = NULL;
+       struct sockaddr_storage localaddr;
+       struct sctp_event_subscribe subscribe;
+       int result = -EINVAL, num = 1, i, addr_len;
+
+       if (!local_count) {
+               init_local();
+               if (!local_count) {
+                       log_print("no local IP address has been set");
+                       goto out;
+               }
+       }
+
+       result = sock_create_kern(local_addr[0]->ss_family, SOCK_SEQPACKET,
+                                 IPPROTO_SCTP, &sock);
+       if (result < 0) {
+               log_print("Can't create comms socket, check SCTP is loaded");
+               goto out;
+       }
+
+       /* Listen for events */
+       memset(&subscribe, 0, sizeof(subscribe));
+       subscribe.sctp_data_io_event = 1;
+       subscribe.sctp_association_event = 1;
+       subscribe.sctp_send_failure_event = 1;
+       subscribe.sctp_shutdown_event = 1;
+       subscribe.sctp_partial_delivery_event = 1;
+
+       fs = get_fs();
+       set_fs(get_ds());
+       result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
+                                      (char *)&subscribe, sizeof(subscribe));
+       set_fs(fs);
+
+       if (result < 0) {
+               log_print("Failed to set SCTP_EVENTS on socket: result=%d",
+                         result);
+               goto create_delsock;
+       }
+
+       /* Init con struct */
+       sock->sk->sk_user_data = &sctp_con;
+       sctp_con.sock = sock;
+       sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
+
+       /* Bind to all interfaces. */
+       for (i = 0; i < local_count; i++) {
+               memcpy(&localaddr, local_addr[i], sizeof(localaddr));
+               make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
+
+               result = add_bind_addr(&localaddr, addr_len, num);
+               if (result)
+                       goto create_delsock;
+               ++num;
+       }
+
+       result = sock->ops->listen(sock, 5);
+       if (result < 0) {
+               log_print("Can't set socket listening");
+               goto create_delsock;
+       }
+
+       return 0;
+
+ create_delsock:
+       sock_release(sock);
+       sctp_con.sock = NULL;
+ out:
+       return result;
+}
+
+
+static struct writequeue_entry *new_writequeue_entry(int allocation)
+{
+       struct writequeue_entry *entry;
+
+       entry = kmalloc(sizeof(struct writequeue_entry), allocation);
+       if (!entry)
+               return NULL;
+
+       entry->page = alloc_page(allocation);
+       if (!entry->page) {
+               kfree(entry);
+               return NULL;
+       }
+
+       entry->offset = 0;
+       entry->len = 0;
+       entry->end = 0;
+       entry->users = 0;
+
+       return entry;
+}
+
+void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc)
+{
+       struct writequeue_entry *e;
+       int offset = 0;
+       int users = 0;
+       struct nodeinfo *ni;
+
+       if (!atomic_read(&accepting))
+               return NULL;
+
+       ni = nodeid2nodeinfo(nodeid, allocation);
+       if (!ni)
+               return NULL;
+
+       spin_lock(&ni->writequeue_lock);
+       e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
+       if (((struct list_head *) e == &ni->writequeue) ||
+           (PAGE_CACHE_SIZE - e->end < len)) {
+               e = NULL;
+       } else {
+               offset = e->end;
+               e->end += len;
+               users = e->users++;
+       }
+       spin_unlock(&ni->writequeue_lock);
+
+       if (e) {
+             got_one:
+               if (users == 0)
+                       kmap(e->page);
+               *ppc = page_address(e->page) + offset;
+               return e;
+       }
+
+       e = new_writequeue_entry(allocation);
+       if (e) {
+               spin_lock(&ni->writequeue_lock);
+               offset = e->end;
+               e->end += len;
+               e->ni = ni;
+               users = e->users++;
+               list_add_tail(&e->list, &ni->writequeue);
+               spin_unlock(&ni->writequeue_lock);
+               goto got_one;
+       }
+       return NULL;
+}
+
+void dlm_lowcomms_commit_buffer(void *arg)
+{
+       struct writequeue_entry *e = (struct writequeue_entry *) arg;
+       int users;
+       struct nodeinfo *ni = e->ni;
+
+       if (!atomic_read(&accepting))
+               return;
+
+       spin_lock(&ni->writequeue_lock);
+       users = --e->users;
+       if (users)
+               goto out;
+       e->len = e->end - e->offset;
+       kunmap(e->page);
+       spin_unlock(&ni->writequeue_lock);
+
+       if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+               spin_lock_bh(&write_nodes_lock);
+               list_add_tail(&ni->write_list, &write_nodes);
+               spin_unlock_bh(&write_nodes_lock);
+               wake_up_process(send_task);
+       }
+       return;
+
+      out:
+       spin_unlock(&ni->writequeue_lock);
+       return;
+}
+
+static void free_entry(struct writequeue_entry *e)
+{
+       __free_page(e->page);
+       kfree(e);
+}
+
+/* Initiate an SCTP association. In theory we could just use sendmsg() on
+   the first IP address and it should work, but this allows us to set up the
+   association before sending any valuable data that we can't afford to lose.
+   It also keeps the send path clean as it can now always use the association ID */
+static void initiate_association(int nodeid)
+{
+       struct sockaddr_storage rem_addr;
+       static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+       struct msghdr outmessage;
+       struct cmsghdr *cmsg;
+       struct sctp_sndrcvinfo *sinfo;
+       int ret;
+       int addrlen;
+       char buf[1];
+       struct kvec iov[1];
+       struct nodeinfo *ni;
+
+       log_print("Initiating association with node %d", nodeid);
+
+       ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
+       if (!ni)
+               return;
+
+       if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
+               log_print("no address for nodeid %d", nodeid);
+               return;
+       }
+
+       make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
+
+       outmessage.msg_name = &rem_addr;
+       outmessage.msg_namelen = addrlen;
+       outmessage.msg_control = outcmsg;
+       outmessage.msg_controllen = sizeof(outcmsg);
+       outmessage.msg_flags = MSG_EOR;
+
+       iov[0].iov_base = buf;
+       iov[0].iov_len = 1;
+
+       /* Real INIT messages seem to cause trouble. Just send a 1 byte message
+          we can afford to lose */
+       cmsg = CMSG_FIRSTHDR(&outmessage);
+       cmsg->cmsg_level = IPPROTO_SCTP;
+       cmsg->cmsg_type = SCTP_SNDRCV;
+       cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+       sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+       memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+       sinfo->sinfo_ppid = cpu_to_le32(local_nodeid);
+
+       outmessage.msg_controllen = cmsg->cmsg_len;
+       ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
+       if (ret < 0) {
+               log_print("send INIT to node failed: %d", ret);
+               /* Try again later */
+               clear_bit(NI_INIT_PENDING, &ni->flags);
+       }
+}
+
+/* Send a message */
+static int send_to_sock(struct nodeinfo *ni)
+{
+       int ret = 0;
+       struct writequeue_entry *e;
+       int len, offset;
+       struct msghdr outmsg;
+       static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+       struct cmsghdr *cmsg;
+       struct sctp_sndrcvinfo *sinfo;
+       struct kvec iov;
+
+        /* See if we need to init an association before we start
+          sending precious messages */
+       spin_lock(&ni->lock);
+       if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
+               spin_unlock(&ni->lock);
+               initiate_association(ni->nodeid);
+               return 0;
+       }
+       spin_unlock(&ni->lock);
+
+       outmsg.msg_name = NULL; /* We use assoc_id */
+       outmsg.msg_namelen = 0;
+       outmsg.msg_control = outcmsg;
+       outmsg.msg_controllen = sizeof(outcmsg);
+       outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
+
+       cmsg = CMSG_FIRSTHDR(&outmsg);
+       cmsg->cmsg_level = IPPROTO_SCTP;
+       cmsg->cmsg_type = SCTP_SNDRCV;
+       cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+       sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+       memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+       sinfo->sinfo_ppid = cpu_to_le32(local_nodeid);
+       sinfo->sinfo_assoc_id = ni->assoc_id;
+       outmsg.msg_controllen = cmsg->cmsg_len;
+
+       spin_lock(&ni->writequeue_lock);
+       for (;;) {
+               if (list_empty(&ni->writequeue))
+                       break;
+               e = list_entry(ni->writequeue.next, struct writequeue_entry,
+                              list);
+               kmap(e->page);
+               len = e->len;
+               offset = e->offset;
+               BUG_ON(len == 0 && e->users == 0);
+               spin_unlock(&ni->writequeue_lock);
+
+               ret = 0;
+               if (len) {
+                       iov.iov_base = page_address(e->page)+offset;
+                       iov.iov_len = len;
+
+                       ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
+                                            len);
+                       if (ret == -EAGAIN) {
+                               sctp_con.eagain_flag = 1;
+                               goto out;
+                       } else if (ret < 0)
+                               goto send_error;
+               } else {
+                       /* Don't starve people filling buffers */
+                       schedule();
+               }
+
+               spin_lock(&ni->writequeue_lock);
+               e->offset += ret;
+               e->len -= ret;
+
+               if (e->len == 0 && e->users == 0) {
+                       list_del(&e->list);
+                       free_entry(e);
+                       continue;
+               }
+       }
+       spin_unlock(&ni->writequeue_lock);
+ out:
+       return ret;
+
+ send_error:
+       log_print("Error sending to node %d %d", ni->nodeid, ret);
+       spin_lock(&ni->lock);
+       if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
+               ni->assoc_id = 0;
+               spin_unlock(&ni->lock);
+               initiate_association(ni->nodeid);
+       } else
+               spin_unlock(&ni->lock);
+
+       return ret;
+}
+
+/* Try to send any messages that are pending */
+static void process_output_queue(void)
+{
+       struct list_head *list;
+       struct list_head *temp;
+
+       spin_lock_bh(&write_nodes_lock);
+       list_for_each_safe(list, temp, &write_nodes) {
+               struct nodeinfo *ni =
+                   list_entry(list, struct nodeinfo, write_list);
+               clear_bit(NI_WRITE_PENDING, &ni->flags);
+               list_del(&ni->write_list);
+
+               spin_unlock_bh(&write_nodes_lock);
+
+               send_to_sock(ni);
+               spin_lock_bh(&write_nodes_lock);
+       }
+       spin_unlock_bh(&write_nodes_lock);
+}
+
+/* Called after we've had -EAGAIN and been woken up */
+static void refill_write_queue(void)
+{
+       int i;
+
+       for (i=1; i<=max_nodeid; i++) {
+               struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
+
+               if (ni) {
+                       if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+                               spin_lock_bh(&write_nodes_lock);
+                               list_add_tail(&ni->write_list, &write_nodes);
+                               spin_unlock_bh(&write_nodes_lock);
+                       }
+               }
+       }
+}
+
+static void clean_one_writequeue(struct nodeinfo *ni)
+{
+       struct list_head *list;
+       struct list_head *temp;
+
+       spin_lock(&ni->writequeue_lock);
+       list_for_each_safe(list, temp, &ni->writequeue) {
+               struct writequeue_entry *e =
+                       list_entry(list, struct writequeue_entry, list);
+               list_del(&e->list);
+               free_entry(e);
+       }
+       spin_unlock(&ni->writequeue_lock);
+}
+
+static void clean_writequeues(void)
+{
+       int i;
+
+       for (i=1; i<=max_nodeid; i++) {
+               struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
+               if (ni)
+                       clean_one_writequeue(ni);
+       }
+}
+
+
+static void dealloc_nodeinfo(void)
+{
+       int i;
+
+       for (i=1; i<=max_nodeid; i++) {
+               struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
+               if (ni) {
+                       idr_remove(&nodeinfo_idr, i);
+                       kfree(ni);
+               }
+       }
+}
+
+static int write_list_empty(void)
+{
+       int status;
+
+       spin_lock_bh(&write_nodes_lock);
+       status = list_empty(&write_nodes);
+       spin_unlock_bh(&write_nodes_lock);
+
+       return status;
+}
+
+static int dlm_recvd(void *data)
+{
+       DECLARE_WAITQUEUE(wait, current);
+
+       while (!kthread_should_stop()) {
+               int count = 0;
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               add_wait_queue(&lowcomms_recv_wait, &wait);
+               if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
+                       schedule();
+               remove_wait_queue(&lowcomms_recv_wait, &wait);
+               set_current_state(TASK_RUNNING);
+
+               if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
+                       int ret;
+
+                       do {
+                               ret = receive_from_sock();
+
+                               /* Don't starve out everyone else */
+                               if (++count >= MAX_RX_MSG_COUNT) {
+                                       schedule();
+                                       count = 0;
+                               }
+                       } while (!kthread_should_stop() && ret >=0);
+               }
+               schedule();
+       }
+
+       return 0;
+}
+
+static int dlm_sendd(void *data)
+{
+       DECLARE_WAITQUEUE(wait, current);
+
+       add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
+
+       while (!kthread_should_stop()) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (write_list_empty())
+                       schedule();
+               set_current_state(TASK_RUNNING);
+
+               if (sctp_con.eagain_flag) {
+                       sctp_con.eagain_flag = 0;
+                       refill_write_queue();
+               }
+               process_output_queue();
+       }
+
+       remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
+
+       return 0;
+}
+
+static void daemons_stop(void)
+{
+       kthread_stop(recv_task);
+       kthread_stop(send_task);
+}
+
+static int daemons_start(void)
+{
+       struct task_struct *p;
+       int error;
+
+       p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
+       error = IS_ERR(p);
+               if (error) {
+               log_print("can't start dlm_recvd %d", error);
+               return error;
+       }
+       recv_task = p;
+
+       p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
+       error = IS_ERR(p);
+               if (error) {
+               log_print("can't start dlm_sendd %d", error);
+               kthread_stop(recv_task);
+               return error;
+       }
+       send_task = p;
+
+       return 0;
+}
+
+/*
+ * This is quite likely to sleep...
+ */
+int dlm_lowcomms_start(void)
+{
+       int error;
+
+       spin_lock_init(&write_nodes_lock);
+       INIT_LIST_HEAD(&write_nodes);
+       init_rwsem(&nodeinfo_lock);
+
+       error = init_sock();
+       if (error)
+               goto fail_sock;
+       error = daemons_start();
+       if (error)
+               goto fail_sock;
+       atomic_set(&accepting, 1);
+       return 0;
+
+ fail_sock:
+       close_connection();
+       return error;
+}
+
+/* Set all the activity flags to prevent any socket activity. */
+
+void dlm_lowcomms_stop(void)
+{
+       atomic_set(&accepting, 0);
+       sctp_con.flags = 0x7;
+       daemons_stop();
+       clean_writequeues();
+       close_connection();
+       dealloc_nodeinfo();
+       max_nodeid = 0;
+}
+
+int dlm_lowcomms_init(void)
+{
+       init_waitqueue_head(&lowcomms_recv_wait);
+       return 0;
+}
+
+void dlm_lowcomms_exit(void)
+{
+       int i;
+
+       for (i = 0; i < local_count; i++)
+               kfree(local_addr[i]);
+       local_count = 0;
+       local_nodeid = 0;
+}
+
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644 (file)
index 0000000..3af8035
--- /dev/null
@@ -0,0 +1,25 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOWCOMMS_DOT_H__
+#define __LOWCOMMS_DOT_H__
+
+int dlm_lowcomms_init(void);
+void dlm_lowcomms_exit(void);
+int dlm_lowcomms_start(void);
+void dlm_lowcomms_stop(void);
+void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc);
+void dlm_lowcomms_commit_buffer(void *mh);
+
+#endif                         /* __LOWCOMMS_DOT_H__ */
+
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644 (file)
index 0000000..cc3e92f
--- /dev/null
@@ -0,0 +1,18 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LVB_TABLE_DOT_H__
+#define __LVB_TABLE_DOT_H__
+
+extern const int dlm_lvb_operations[8][8];
+
+#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644 (file)
index 0000000..81bf4cb
--- /dev/null
@@ -0,0 +1,89 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "lock.h"
+#include "memory.h"
+#include "lowcomms.h"
+#include "config.h"
+
+#ifdef CONFIG_DLM_DEBUG
+int dlm_register_debugfs(void);
+void dlm_unregister_debugfs(void);
+#else
+static inline int dlm_register_debugfs(void) { return 0; }
+static inline void dlm_unregister_debugfs(void) { }
+#endif
+
+static int __init init_dlm(void)
+{
+       int error;
+
+       error = dlm_memory_init();
+       if (error)
+               goto out;
+
+       error = dlm_lockspace_init();
+       if (error)
+               goto out_mem;
+
+       error = dlm_config_init();
+       if (error)
+               goto out_lockspace;
+
+       error = dlm_register_debugfs();
+       if (error)
+               goto out_config;
+
+       error = dlm_lowcomms_init();
+       if (error)
+               goto out_debug;
+
+       printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
+
+       return 0;
+
+ out_debug:
+       dlm_unregister_debugfs();
+ out_config:
+       dlm_config_exit();
+ out_lockspace:
+       dlm_lockspace_exit();
+ out_mem:
+       dlm_memory_exit();
+ out:
+       return error;
+}
+
+static void __exit exit_dlm(void)
+{
+       dlm_lowcomms_exit();
+       dlm_config_exit();
+       dlm_memory_exit();
+       dlm_lockspace_exit();
+       dlm_unregister_debugfs();
+}
+
+module_init(init_dlm);
+module_exit(exit_dlm);
+
+MODULE_DESCRIPTION("Distributed Lock Manager");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+EXPORT_SYMBOL_GPL(dlm_new_lockspace);
+EXPORT_SYMBOL_GPL(dlm_release_lockspace);
+EXPORT_SYMBOL_GPL(dlm_lock);
+EXPORT_SYMBOL_GPL(dlm_unlock);
+
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644 (file)
index 0000000..439249b
--- /dev/null
@@ -0,0 +1,314 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "recoverd.h"
+#include "recover.h"
+#include "lowcomms.h"
+#include "rcom.h"
+#include "config.h"
+
+/*
+ * Following called by dlm_recoverd thread
+ */
+
+static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
+{
+       struct dlm_member *memb = NULL;
+       struct list_head *tmp;
+       struct list_head *newlist = &new->list;
+       struct list_head *head = &ls->ls_nodes;
+
+       list_for_each(tmp, head) {
+               memb = list_entry(tmp, struct dlm_member, list);
+               if (new->nodeid < memb->nodeid)
+                       break;
+       }
+
+       if (!memb)
+               list_add_tail(newlist, head);
+       else {
+               /* FIXME: can use list macro here */
+               newlist->prev = tmp->prev;
+               newlist->next = tmp;
+               tmp->prev->next = newlist;
+               tmp->prev = newlist;
+       }
+}
+
+static int dlm_add_member(struct dlm_ls *ls, int nodeid)
+{
+       struct dlm_member *memb;
+       int w;
+
+       memb = kmalloc(sizeof(struct dlm_member), GFP_KERNEL);
+       if (!memb)
+               return -ENOMEM;
+
+       w = dlm_node_weight(ls->ls_name, nodeid);
+       if (w < 0)
+               return w;
+
+       memb->nodeid = nodeid;
+       memb->weight = w;
+       add_ordered_member(ls, memb);
+       ls->ls_num_nodes++;
+       return 0;
+}
+
+static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
+{
+       list_move(&memb->list, &ls->ls_nodes_gone);
+       ls->ls_num_nodes--;
+}
+
+static int dlm_is_member(struct dlm_ls *ls, int nodeid)
+{
+       struct dlm_member *memb;
+
+       list_for_each_entry(memb, &ls->ls_nodes, list) {
+               if (memb->nodeid == nodeid)
+                       return TRUE;
+       }
+       return FALSE;
+}
+
+int dlm_is_removed(struct dlm_ls *ls, int nodeid)
+{
+       struct dlm_member *memb;
+
+       list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
+               if (memb->nodeid == nodeid)
+                       return TRUE;
+       }
+       return FALSE;
+}
+
+static void clear_memb_list(struct list_head *head)
+{
+       struct dlm_member *memb;
+
+       while (!list_empty(head)) {
+               memb = list_entry(head->next, struct dlm_member, list);
+               list_del(&memb->list);
+               kfree(memb);
+       }
+}
+
+void dlm_clear_members(struct dlm_ls *ls)
+{
+       clear_memb_list(&ls->ls_nodes);
+       ls->ls_num_nodes = 0;
+}
+
+void dlm_clear_members_gone(struct dlm_ls *ls)
+{
+       clear_memb_list(&ls->ls_nodes_gone);
+}
+
+static void make_member_array(struct dlm_ls *ls)
+{
+       struct dlm_member *memb;
+       int i, w, x = 0, total = 0, all_zero = 0, *array;
+
+       kfree(ls->ls_node_array);
+       ls->ls_node_array = NULL;
+
+       list_for_each_entry(memb, &ls->ls_nodes, list) {
+               if (memb->weight)
+                       total += memb->weight;
+       }
+
+       /* all nodes revert to weight of 1 if all have weight 0 */
+
+       if (!total) {
+               total = ls->ls_num_nodes;
+               all_zero = 1;
+       }
+
+       ls->ls_total_weight = total;
+
+       array = kmalloc(sizeof(int) * total, GFP_KERNEL);
+       if (!array)
+               return;
+
+       list_for_each_entry(memb, &ls->ls_nodes, list) {
+               if (!all_zero && !memb->weight)
+                       continue;
+
+               if (all_zero)
+                       w = 1;
+               else
+                       w = memb->weight;
+
+               DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
+
+               for (i = 0; i < w; i++)
+                       array[x++] = memb->nodeid;
+       }
+
+       ls->ls_node_array = array;
+}
+
+/* send a status request to all members just to establish comms connections */
+
+static void ping_members(struct dlm_ls *ls)
+{
+       struct dlm_member *memb;
+       list_for_each_entry(memb, &ls->ls_nodes, list)
+               dlm_rcom_status(ls, memb->nodeid);
+}
+
+int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
+{
+       struct dlm_member *memb, *safe;
+       int i, error, found, pos = 0, neg = 0, low = -1;
+
+       /* move departed members from ls_nodes to ls_nodes_gone */
+
+       list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
+               found = FALSE;
+               for (i = 0; i < rv->node_count; i++) {
+                       if (memb->nodeid == rv->nodeids[i]) {
+                               found = TRUE;
+                               break;
+                       }
+               }
+
+               if (!found) {
+                       neg++;
+                       dlm_remove_member(ls, memb);
+                       log_debug(ls, "remove member %d", memb->nodeid);
+               }
+       }
+
+       /* add new members to ls_nodes */
+
+       for (i = 0; i < rv->node_count; i++) {
+               if (dlm_is_member(ls, rv->nodeids[i]))
+                       continue;
+               dlm_add_member(ls, rv->nodeids[i]);
+               pos++;
+               log_debug(ls, "add member %d", rv->nodeids[i]);
+       }
+
+       list_for_each_entry(memb, &ls->ls_nodes, list) {
+               if (low == -1 || memb->nodeid < low)
+                       low = memb->nodeid;
+       }
+       ls->ls_low_nodeid = low;
+
+       make_member_array(ls);
+       dlm_set_recover_status(ls, DLM_RS_NODES);
+       *neg_out = neg;
+
+       ping_members(ls);
+
+       error = dlm_recover_members_wait(ls);
+       log_debug(ls, "total members %d", ls->ls_num_nodes);
+       return error;
+}
+
+/*
+ * Following called from lockspace.c
+ */
+
+int dlm_ls_stop(struct dlm_ls *ls)
+{
+       int new;
+
+       /*
+        * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
+        * dlm_recovery_stopped()) and prevents any new locks from being
+        * processed (see RUNNING, dlm_locking_stopped()).
+        */
+
+       spin_lock(&ls->ls_recover_lock);
+       set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+       new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
+       ls->ls_recover_seq++;
+       spin_unlock(&ls->ls_recover_lock);
+
+       /*
+        * This in_recovery lock does two things:
+        *
+        * 1) Keeps this function from returning until all threads are out
+        *    of locking routines and locking is truely stopped.
+        * 2) Keeps any new requests from being processed until it's unlocked
+        *    when recovery is complete.
+        */
+
+       if (new)
+               down_write(&ls->ls_in_recovery);
+
+       /*
+        * The recoverd suspend/resume makes sure that dlm_recoverd (if
+        * running) has noticed the clearing of RUNNING above and quit
+        * processing the previous recovery.  This will be true for all nodes
+        * before any nodes start the new recovery.
+        */
+
+       dlm_recoverd_suspend(ls);
+       ls->ls_recover_status = 0;
+       dlm_recoverd_resume(ls);
+       return 0;
+}
+
+int dlm_ls_start(struct dlm_ls *ls)
+{
+       struct dlm_recover *rv = NULL, *rv_old;
+       int *ids = NULL;
+       int error, count;
+
+       rv = kmalloc(sizeof(struct dlm_recover), GFP_KERNEL);
+       if (!rv)
+               return -ENOMEM;
+       memset(rv, 0, sizeof(struct dlm_recover));
+
+       error = count = dlm_nodeid_list(ls->ls_name, &ids);
+       if (error <= 0)
+               goto fail;
+
+       spin_lock(&ls->ls_recover_lock);
+
+       /* the lockspace needs to be stopped before it can be started */
+
+       if (!dlm_locking_stopped(ls)) {
+               spin_unlock(&ls->ls_recover_lock);
+               log_error(ls, "start ignored: lockspace running");
+               error = -EINVAL;
+               goto fail;
+       }
+
+       rv->nodeids = ids;
+       rv->node_count = count;
+       rv->seq = ++ls->ls_recover_seq;
+       rv_old = ls->ls_recover_args;
+       ls->ls_recover_args = rv;
+       spin_unlock(&ls->ls_recover_lock);
+
+       if (rv_old) {
+               kfree(rv_old->nodeids);
+               kfree(rv_old);
+       }
+
+       dlm_recoverd_kick(ls);
+       return 0;
+
+ fail:
+       kfree(rv);
+       kfree(ids);
+       return error;
+}
+
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644 (file)
index 0000000..927c08c
--- /dev/null
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MEMBER_DOT_H__
+#define __MEMBER_DOT_H__
+
+int dlm_ls_stop(struct dlm_ls *ls);
+int dlm_ls_start(struct dlm_ls *ls);
+void dlm_clear_members(struct dlm_ls *ls);
+void dlm_clear_members_gone(struct dlm_ls *ls);
+int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
+int dlm_is_removed(struct dlm_ls *ls, int nodeid);
+
+#endif                          /* __MEMBER_DOT_H__ */
+
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644 (file)
index 0000000..0b9851d
--- /dev/null
@@ -0,0 +1,122 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "config.h"
+#include "memory.h"
+
+static kmem_cache_t *lkb_cache;
+
+
+int dlm_memory_init(void)
+{
+       int ret = 0;
+
+       lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
+                               __alignof__(struct dlm_lkb), 0, NULL, NULL);
+       if (!lkb_cache)
+               ret = -ENOMEM;
+       return ret;
+}
+
+void dlm_memory_exit(void)
+{
+       if (lkb_cache)
+               kmem_cache_destroy(lkb_cache);
+}
+
+char *allocate_lvb(struct dlm_ls *ls)
+{
+       char *p;
+
+       p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
+       if (p)
+               memset(p, 0, ls->ls_lvblen);
+       return p;
+}
+
+void free_lvb(char *p)
+{
+       kfree(p);
+}
+
+uint64_t *allocate_range(struct dlm_ls *ls)
+{
+       int ralen = 4*sizeof(uint64_t);
+       uint64_t *p;
+
+       p = kmalloc(ralen, GFP_KERNEL);
+       if (p)
+               memset(p, 0, ralen);
+       return p;
+}
+
+void free_range(uint64_t *p)
+{
+       kfree(p);
+}
+
+/* FIXME: have some minimal space built-in to rsb for the name and
+   kmalloc a separate name if needed, like dentries are done */
+
+struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
+{
+       struct dlm_rsb *r;
+
+       DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
+
+       r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
+       if (r)
+               memset(r, 0, sizeof(*r) + namelen);
+       return r;
+}
+
+void free_rsb(struct dlm_rsb *r)
+{
+       if (r->res_lvbptr)
+               free_lvb(r->res_lvbptr);
+       kfree(r);
+}
+
+struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
+{
+       struct dlm_lkb *lkb;
+
+       lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
+       if (lkb)
+               memset(lkb, 0, sizeof(*lkb));
+       return lkb;
+}
+
+void free_lkb(struct dlm_lkb *lkb)
+{
+       kmem_cache_free(lkb_cache, lkb);
+}
+
+struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
+{
+       struct dlm_direntry *de;
+
+       DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
+
+       de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
+       if (de)
+               memset(de, 0, sizeof(*de) + namelen);
+       return de;
+}
+
+void free_direntry(struct dlm_direntry *de)
+{
+       kfree(de);
+}
+
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644 (file)
index 0000000..7b23513
--- /dev/null
@@ -0,0 +1,31 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MEMORY_DOT_H__
+#define __MEMORY_DOT_H__
+
+int dlm_memory_init(void);
+void dlm_memory_exit(void);
+struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
+void free_rsb(struct dlm_rsb *r);
+struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
+void free_lkb(struct dlm_lkb *l);
+struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
+void free_direntry(struct dlm_direntry *de);
+char *allocate_lvb(struct dlm_ls *ls);
+void free_lvb(char *l);
+uint64_t *allocate_range(struct dlm_ls *ls);
+void free_range(uint64_t *l);
+
+#endif         /* __MEMORY_DOT_H__ */
+
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644 (file)
index 0000000..d96f9bb
--- /dev/null
@@ -0,0 +1,140 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * midcomms.c
+ *
+ * This is the appallingly named "mid-level" comms layer.
+ *
+ * Its purpose is to take packets from the "real" comms layer,
+ * split them up into packets and pass them to the interested
+ * part of the locking mechanism.
+ *
+ * It also takes messages from the locking layer, formats them
+ * into packets and sends them to the comms layer.
+ */
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "rcom.h"
+#include "lock.h"
+#include "midcomms.h"
+
+
+static void copy_from_cb(void *dst, const void *base, unsigned offset,
+                        unsigned len, unsigned limit)
+{
+       unsigned copy = len;
+
+       if ((copy + offset) > limit)
+               copy = limit - offset;
+       memcpy(dst, base + offset, copy);
+       len -= copy;
+       if (len)
+               memcpy(dst + copy, base, len);
+}
+
+/*
+ * Called from the low-level comms layer to process a buffer of
+ * commands.
+ *
+ * Only complete messages are processed here, any "spare" bytes from
+ * the end of a buffer are saved and tacked onto the front of the next
+ * message that comes in. I doubt this will happen very often but we
+ * need to be able to cope with it and I don't want the task to be waiting
+ * for packets to come in when there is useful work to be done.
+ */
+
+int dlm_process_incoming_buffer(int nodeid, const void *base,
+                               unsigned offset, unsigned len, unsigned limit)
+{
+       unsigned char __tmp[DLM_INBUF_LEN];
+       struct dlm_header *msg = (struct dlm_header *) __tmp;
+       int ret = 0;
+       int err = 0;
+       uint16_t msglen;
+       uint32_t lockspace;
+
+       while (len > sizeof(struct dlm_header)) {
+
+               /* Copy just the header to check the total length.  The
+                  message may wrap around the end of the buffer back to the
+                  start, so we need to use a temp buffer and copy_from_cb. */
+
+               copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
+                            limit);
+
+               msglen = le16_to_cpu(msg->h_length);
+               lockspace = msg->h_lockspace;
+
+               err = -EINVAL;
+               if (msglen < sizeof(struct dlm_header))
+                       break;
+               err = -E2BIG;
+               if (msglen > dlm_config.buffer_size) {
+                       log_print("message size %d from %d too big, buf len %d",
+                                 msglen, nodeid, len);
+                       break;
+               }
+               err = 0;
+
+               /* If only part of the full message is contained in this
+                  buffer, then do nothing and wait for lowcomms to call
+                  us again later with more data.  We return 0 meaning
+                  we've consumed none of the input buffer. */
+
+               if (msglen > len)
+                       break;
+
+               /* Allocate a larger temp buffer if the full message won't fit
+                  in the buffer on the stack (which should work for most
+                  ordinary messages). */
+
+               if (msglen > sizeof(__tmp) &&
+                   msg == (struct dlm_header *) __tmp) {
+                       msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
+                       if (msg == NULL)
+                               return ret;
+               }
+
+               copy_from_cb(msg, base, offset, msglen, limit);
+
+               BUG_ON(lockspace != msg->h_lockspace);
+
+               ret += msglen;
+               offset += msglen;
+               offset &= (limit - 1);
+               len -= msglen;
+
+               switch (msg->h_cmd) {
+               case DLM_MSG:
+                       dlm_receive_message(msg, nodeid, FALSE);
+                       break;
+
+               case DLM_RCOM:
+                       dlm_receive_rcom(msg, nodeid);
+                       break;
+
+               default:
+                       log_print("unknown msg type %x from %u: %u %u %u %u",
+                                 msg->h_cmd, nodeid, msglen, len, offset, ret);
+               }
+       }
+
+       if (msg != (struct dlm_header *) __tmp)
+               kfree(msg);
+
+       return err ? err : ret;
+}
+
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644 (file)
index 0000000..95852a5
--- /dev/null
@@ -0,0 +1,21 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MIDCOMMS_DOT_H__
+#define __MIDCOMMS_DOT_H__
+
+int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
+                               unsigned len, unsigned limit);
+
+#endif                         /* __MIDCOMMS_DOT_H__ */
+
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644 (file)
index 0000000..4c5c08a
--- /dev/null
@@ -0,0 +1,460 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "midcomms.h"
+#include "rcom.h"
+#include "recover.h"
+#include "dir.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+#include "util.h"
+
+
+static int rcom_response(struct dlm_ls *ls)
+{
+       return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
+}
+
+static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
+                      struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
+{
+       struct dlm_rcom *rc;
+       struct dlm_mhandle *mh;
+       char *mb;
+       int mb_len = sizeof(struct dlm_rcom) + len;
+
+       mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
+       if (!mh) {
+               log_print("create_rcom to %d type %d len %d ENOBUFS",
+                         to_nodeid, type, len);
+               return -ENOBUFS;
+       }
+       memset(mb, 0, mb_len);
+
+       rc = (struct dlm_rcom *) mb;
+
+       rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+       rc->rc_header.h_lockspace = ls->ls_global_id;
+       rc->rc_header.h_nodeid = dlm_our_nodeid();
+       rc->rc_header.h_length = mb_len;
+       rc->rc_header.h_cmd = DLM_RCOM;
+
+       rc->rc_type = type;
+
+       *mh_ret = mh;
+       *rc_ret = rc;
+       return 0;
+}
+
+static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
+                     struct dlm_rcom *rc)
+{
+       dlm_rcom_out(rc);
+       dlm_lowcomms_commit_buffer(mh);
+}
+
+/* When replying to a status request, a node also sends back its
+   configuration values.  The requesting node then checks that the remote
+   node is configured the same way as itself. */
+
+static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
+{
+       rf->rf_lvblen = ls->ls_lvblen;
+       rf->rf_lsflags = ls->ls_exflags;
+}
+
+static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
+{
+       if (rf->rf_lvblen != ls->ls_lvblen ||
+           rf->rf_lsflags != ls->ls_exflags) {
+               log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
+                         ls->ls_lvblen, ls->ls_exflags,
+                         nodeid, rf->rf_lvblen, rf->rf_lsflags);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
+{
+       struct dlm_rcom *rc;
+       struct dlm_mhandle *mh;
+       int error = 0;
+
+       memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
+
+       if (nodeid == dlm_our_nodeid()) {
+               rc = (struct dlm_rcom *) ls->ls_recover_buf;
+               rc->rc_result = dlm_recover_status(ls);
+               goto out;
+       }
+
+       error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
+       if (error)
+               goto out;
+
+       send_rcom(ls, mh, rc);
+
+       error = dlm_wait_function(ls, &rcom_response);
+       clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+       if (error)
+               goto out;
+
+       rc = (struct dlm_rcom *) ls->ls_recover_buf;
+
+       if (rc->rc_result == -ESRCH) {
+               /* we pretend the remote lockspace exists with 0 status */
+               log_debug(ls, "remote node %d not ready", nodeid);
+               rc->rc_result = 0;
+       } else
+               error = check_config(ls, (struct rcom_config *) rc->rc_buf,
+                                    nodeid);
+       /* the caller looks at rc_result for the remote recovery status */
+ out:
+       return error;
+}
+
+static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+       struct dlm_rcom *rc;
+       struct dlm_mhandle *mh;
+       int error, nodeid = rc_in->rc_header.h_nodeid;
+
+       error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
+                           sizeof(struct rcom_config), &rc, &mh);
+       if (error)
+               return;
+       rc->rc_result = dlm_recover_status(ls);
+       make_config(ls, (struct rcom_config *) rc->rc_buf);
+
+       send_rcom(ls, mh, rc);
+}
+
+static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+       memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
+       set_bit(LSFL_RCOM_READY, &ls->ls_flags);
+       wake_up(&ls->ls_wait_general);
+}
+
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
+{
+       struct dlm_rcom *rc;
+       struct dlm_mhandle *mh;
+       int error = 0, len = sizeof(struct dlm_rcom);
+
+       memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
+
+       if (nodeid == dlm_our_nodeid()) {
+               dlm_copy_master_names(ls, last_name, last_len,
+                                     ls->ls_recover_buf + len,
+                                     dlm_config.buffer_size - len, nodeid);
+               goto out;
+       }
+
+       error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
+       if (error)
+               goto out;
+       memcpy(rc->rc_buf, last_name, last_len);
+
+       send_rcom(ls, mh, rc);
+
+       error = dlm_wait_function(ls, &rcom_response);
+       clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+ out:
+       return error;
+}
+
+static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+       struct dlm_rcom *rc;
+       struct dlm_mhandle *mh;
+       int error, inlen, outlen;
+       int nodeid = rc_in->rc_header.h_nodeid;
+       uint32_t status = dlm_recover_status(ls);
+
+       /*
+        * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
+        * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
+        * It could only happen in rare cases where we get a late NAMES
+        * message from a previous instance of recovery.
+        */
+
+       if (!(status & DLM_RS_NODES)) {
+               log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
+               return;
+       }
+
+       nodeid = rc_in->rc_header.h_nodeid;
+       inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+       outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
+
+       error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
+       if (error)
+               return;
+
+       dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
+                             nodeid);
+       send_rcom(ls, mh, rc);
+}
+
+static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+       memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
+       set_bit(LSFL_RCOM_READY, &ls->ls_flags);
+       wake_up(&ls->ls_wait_general);
+}
+
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
+{
+       struct dlm_rcom *rc;
+       struct dlm_mhandle *mh;
+       struct dlm_ls *ls = r->res_ls;
+       int error;
+
+       error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
+                           &rc, &mh);
+       if (error)
+               goto out;
+       memcpy(rc->rc_buf, r->res_name, r->res_length);
+       rc->rc_id = (unsigned long) r;
+
+       send_rcom(ls, mh, rc);
+ out:
+       return error;
+}
+
+static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+       struct dlm_rcom *rc;
+       struct dlm_mhandle *mh;
+       int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
+       int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+
+       error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
+       if (error)
+               return;
+
+       error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
+       if (error)
+               ret_nodeid = error;
+       rc->rc_result = ret_nodeid;
+       rc->rc_id = rc_in->rc_id;
+
+       send_rcom(ls, mh, rc);
+}
+
+static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+       dlm_recover_master_reply(ls, rc_in);
+}
+
+static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                          struct rcom_lock *rl)
+{
+       memset(rl, 0, sizeof(*rl));
+
+       rl->rl_ownpid = lkb->lkb_ownpid;
+       rl->rl_lkid = lkb->lkb_id;
+       rl->rl_exflags = lkb->lkb_exflags;
+       rl->rl_flags = lkb->lkb_flags;
+       rl->rl_lvbseq = lkb->lkb_lvbseq;
+       rl->rl_rqmode = lkb->lkb_rqmode;
+       rl->rl_grmode = lkb->lkb_grmode;
+       rl->rl_status = lkb->lkb_status;
+       rl->rl_wait_type = lkb->lkb_wait_type;
+
+       if (lkb->lkb_bastaddr)
+               rl->rl_asts |= AST_BAST;
+       if (lkb->lkb_astaddr)
+               rl->rl_asts |= AST_COMP;
+
+       if (lkb->lkb_range)
+               memcpy(rl->rl_range, lkb->lkb_range, 4*sizeof(uint64_t));
+
+       rl->rl_namelen = r->res_length;
+       memcpy(rl->rl_name, r->res_name, r->res_length);
+
+       /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
+          If so, receive_rcom_lock_args() won't take this copy. */
+
+       if (lkb->lkb_lvbptr)
+               memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+}
+
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+       struct dlm_ls *ls = r->res_ls;
+       struct dlm_rcom *rc;
+       struct dlm_mhandle *mh;
+       struct rcom_lock *rl;
+       int error, len = sizeof(struct rcom_lock);
+
+       if (lkb->lkb_lvbptr)
+               len += ls->ls_lvblen;
+
+       error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
+       if (error)
+               goto out;
+
+       rl = (struct rcom_lock *) rc->rc_buf;
+       pack_rcom_lock(r, lkb, rl);
+       rc->rc_id = (unsigned long) r;
+
+       send_rcom(ls, mh, rc);
+ out:
+       return error;
+}
+
+static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+       struct dlm_rcom *rc;
+       struct dlm_mhandle *mh;
+       int error, nodeid = rc_in->rc_header.h_nodeid;
+
+       dlm_recover_master_copy(ls, rc_in);
+
+       error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
+                           sizeof(struct rcom_lock), &rc, &mh);
+       if (error)
+               return;
+
+       /* We send back the same rcom_lock struct we received, but
+          dlm_recover_master_copy() has filled in rl_remid and rl_result */
+
+       memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
+       rc->rc_id = rc_in->rc_id;
+
+       send_rcom(ls, mh, rc);
+}
+
+static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+       uint32_t status = dlm_recover_status(ls);
+
+       if (!(status & DLM_RS_DIR)) {
+               log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
+                         rc_in->rc_header.h_nodeid);
+               return;
+       }
+
+       dlm_recover_process_copy(ls, rc_in);
+}
+
+static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
+{
+       struct dlm_rcom *rc;
+       struct dlm_mhandle *mh;
+       char *mb;
+       int mb_len = sizeof(struct dlm_rcom);
+
+       mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
+       if (!mh)
+               return -ENOBUFS;
+       memset(mb, 0, mb_len);
+
+       rc = (struct dlm_rcom *) mb;
+
+       rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+       rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
+       rc->rc_header.h_nodeid = dlm_our_nodeid();
+       rc->rc_header.h_length = mb_len;
+       rc->rc_header.h_cmd = DLM_RCOM;
+
+       rc->rc_type = DLM_RCOM_STATUS_REPLY;
+       rc->rc_result = -ESRCH;
+
+       dlm_rcom_out(rc);
+       dlm_lowcomms_commit_buffer(mh);
+
+       return 0;
+}
+
+/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
+   recovery-only comms are sent through here. */
+
+void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
+{
+       struct dlm_rcom *rc = (struct dlm_rcom *) hd;
+       struct dlm_ls *ls;
+
+       dlm_rcom_in(rc);
+
+       /* If the lockspace doesn't exist then still send a status message
+          back; it's possible that it just doesn't have its global_id yet. */
+
+       ls = dlm_find_lockspace_global(hd->h_lockspace);
+       if (!ls) {
+               log_print("lockspace %x from %d not found",
+                         hd->h_lockspace, nodeid);
+               send_ls_not_ready(nodeid, rc);
+               return;
+       }
+
+       if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
+               log_error(ls, "ignoring recovery message %x from %d",
+                         rc->rc_type, nodeid);
+               goto out;
+       }
+
+       if (nodeid != rc->rc_header.h_nodeid) {
+               log_error(ls, "bad rcom nodeid %d from %d",
+                         rc->rc_header.h_nodeid, nodeid);
+               goto out;
+       }
+
+       switch (rc->rc_type) {
+       case DLM_RCOM_STATUS:
+               receive_rcom_status(ls, rc);
+               break;
+
+       case DLM_RCOM_NAMES:
+               receive_rcom_names(ls, rc);
+               break;
+
+       case DLM_RCOM_LOOKUP:
+               receive_rcom_lookup(ls, rc);
+               break;
+
+       case DLM_RCOM_LOCK:
+               receive_rcom_lock(ls, rc);
+               break;
+
+       case DLM_RCOM_STATUS_REPLY:
+               receive_rcom_status_reply(ls, rc);
+               break;
+
+       case DLM_RCOM_NAMES_REPLY:
+               receive_rcom_names_reply(ls, rc);
+               break;
+
+       case DLM_RCOM_LOOKUP_REPLY:
+               receive_rcom_lookup_reply(ls, rc);
+               break;
+
+       case DLM_RCOM_LOCK_REPLY:
+               receive_rcom_lock_reply(ls, rc);
+               break;
+
+       default:
+               DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
+       }
+ out:
+       dlm_put_lockspace(ls);
+}
+
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644 (file)
index 0000000..d798432
--- /dev/null
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RCOM_DOT_H__
+#define __RCOM_DOT_H__
+
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
+
+#endif
+
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644 (file)
index 0000000..1712c97
--- /dev/null
@@ -0,0 +1,762 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "dir.h"
+#include "config.h"
+#include "ast.h"
+#include "memory.h"
+#include "rcom.h"
+#include "lock.h"
+#include "lowcomms.h"
+#include "member.h"
+#include "recover.h"
+
+
+/*
+ * Recovery waiting routines: these functions wait for a particular reply from
+ * a remote node, or for the remote node to report a certain status.  They need
+ * to abort if the lockspace is stopped indicating a node has failed (perhaps
+ * the one being waited for).
+ */
+
+/*
+ * Wait until given function returns non-zero or lockspace is stopped
+ * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes).  When another
+ * function thinks it could have completed the waited-on task, they should wake
+ * up ls_wait_general to get an immediate response rather than waiting for the
+ * timer to detect the result.  A timer wakes us up periodically while waiting
+ * to see if we should abort due to a node failure.  This should only be called
+ * by the dlm_recoverd thread.
+ */
+
+static void dlm_wait_timer_fn(unsigned long data)
+{
+       struct dlm_ls *ls = (struct dlm_ls *) data;
+       mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
+       wake_up(&ls->ls_wait_general);
+}
+
+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
+{
+       int error = 0;
+
+       init_timer(&ls->ls_timer);
+       ls->ls_timer.function = dlm_wait_timer_fn;
+       ls->ls_timer.data = (long) ls;
+       ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
+       add_timer(&ls->ls_timer);
+
+       wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
+       del_timer_sync(&ls->ls_timer);
+
+       if (dlm_recovery_stopped(ls)) {
+               log_debug(ls, "dlm_wait_function aborted");
+               error = -EINTR;
+       }
+       return error;
+}
+
+/*
+ * An efficient way for all nodes to wait for all others to have a certain
+ * status.  The node with the lowest nodeid polls all the others for their
+ * status (wait_status_all) and all the others poll the node with the low id
+ * for its accumulated result (wait_status_low).  When all nodes have set
+ * status flag X, then status flag X_ALL will be set on the low nodeid.
+ */
+
+uint32_t dlm_recover_status(struct dlm_ls *ls)
+{
+       uint32_t status;
+       spin_lock(&ls->ls_recover_lock);
+       status = ls->ls_recover_status;
+       spin_unlock(&ls->ls_recover_lock);
+       return status;
+}
+
+void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
+{
+       spin_lock(&ls->ls_recover_lock);
+       ls->ls_recover_status |= status;
+       spin_unlock(&ls->ls_recover_lock);
+}
+
+static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
+{
+       struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
+       struct dlm_member *memb;
+       int error = 0, delay;
+
+       list_for_each_entry(memb, &ls->ls_nodes, list) {
+               delay = 0;
+               for (;;) {
+                       if (dlm_recovery_stopped(ls)) {
+                               error = -EINTR;
+                               goto out;
+                       }
+
+                       error = dlm_rcom_status(ls, memb->nodeid);
+                       if (error)
+                               goto out;
+
+                       if (rc->rc_result & wait_status)
+                               break;
+                       if (delay < 1000)
+                               delay += 20;
+                       msleep(delay);
+               }
+       }
+ out:
+       return error;
+}
+
+static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
+{
+       struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
+       int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
+
+       for (;;) {
+               if (dlm_recovery_stopped(ls)) {
+                       error = -EINTR;
+                       goto out;
+               }
+
+               error = dlm_rcom_status(ls, nodeid);
+               if (error)
+                       break;
+
+               if (rc->rc_result & wait_status)
+                       break;
+               if (delay < 1000)
+                       delay += 20;
+               msleep(delay);
+       }
+ out:
+       return error;
+}
+
+static int wait_status(struct dlm_ls *ls, uint32_t status)
+{
+       uint32_t status_all = status << 1;
+       int error;
+
+       if (ls->ls_low_nodeid == dlm_our_nodeid()) {
+               error = wait_status_all(ls, status);
+               if (!error)
+                       dlm_set_recover_status(ls, status_all);
+       } else
+               error = wait_status_low(ls, status_all);
+
+       return error;
+}
+
+int dlm_recover_members_wait(struct dlm_ls *ls)
+{
+       return wait_status(ls, DLM_RS_NODES);
+}
+
+int dlm_recover_directory_wait(struct dlm_ls *ls)
+{
+       return wait_status(ls, DLM_RS_DIR);
+}
+
+int dlm_recover_locks_wait(struct dlm_ls *ls)
+{
+       return wait_status(ls, DLM_RS_LOCKS);
+}
+
+int dlm_recover_done_wait(struct dlm_ls *ls)
+{
+       return wait_status(ls, DLM_RS_DONE);
+}
+
+/*
+ * The recover_list contains all the rsb's for which we've requested the new
+ * master nodeid.  As replies are returned from the resource directories the
+ * rsb's are removed from the list.  When the list is empty we're done.
+ *
+ * The recover_list is later similarly used for all rsb's for which we've sent
+ * new lkb's and need to receive new corresponding lkid's.
+ *
+ * We use the address of the rsb struct as a simple local identifier for the
+ * rsb so we can match an rcom reply with the rsb it was sent for.
+ */
+
+static int recover_list_empty(struct dlm_ls *ls)
+{
+       int empty;
+
+       spin_lock(&ls->ls_recover_list_lock);
+       empty = list_empty(&ls->ls_recover_list);
+       spin_unlock(&ls->ls_recover_list_lock);
+
+       return empty;
+}
+
+static void recover_list_add(struct dlm_rsb *r)
+{
+       struct dlm_ls *ls = r->res_ls;
+
+       spin_lock(&ls->ls_recover_list_lock);
+       if (list_empty(&r->res_recover_list)) {
+               list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
+               ls->ls_recover_list_count++;
+               dlm_hold_rsb(r);
+       }
+       spin_unlock(&ls->ls_recover_list_lock);
+}
+
+static void recover_list_del(struct dlm_rsb *r)
+{
+       struct dlm_ls *ls = r->res_ls;
+
+       spin_lock(&ls->ls_recover_list_lock);
+       list_del_init(&r->res_recover_list);
+       ls->ls_recover_list_count--;
+       spin_unlock(&ls->ls_recover_list_lock);
+
+       dlm_put_rsb(r);
+}
+
+static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
+{
+       struct dlm_rsb *r = NULL;
+
+       spin_lock(&ls->ls_recover_list_lock);
+
+       list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
+               if (id == (unsigned long) r)
+                       goto out;
+       }
+       r = NULL;
+ out:
+       spin_unlock(&ls->ls_recover_list_lock);
+       return r;
+}
+
+static void recover_list_clear(struct dlm_ls *ls)
+{
+       struct dlm_rsb *r, *s;
+
+       spin_lock(&ls->ls_recover_list_lock);
+       list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
+               list_del_init(&r->res_recover_list);
+               dlm_put_rsb(r);
+               ls->ls_recover_list_count--;
+       }
+
+       if (ls->ls_recover_list_count != 0) {
+               log_error(ls, "warning: recover_list_count %d",
+                         ls->ls_recover_list_count);
+               ls->ls_recover_list_count = 0;
+       }
+       spin_unlock(&ls->ls_recover_list_lock);
+}
+
+
+/* Master recovery: find new master node for rsb's that were
+   mastered on nodes that have been removed.
+
+   dlm_recover_masters
+   recover_master
+   dlm_send_rcom_lookup            ->  receive_rcom_lookup
+                                       dlm_dir_lookup
+   receive_rcom_lookup_reply       <-
+   dlm_recover_master_reply
+   set_new_master
+   set_master_lkbs
+   set_lock_master
+*/
+
+/*
+ * Set the lock master for all LKBs in a lock queue
+ * If we are the new master of the rsb, we may have received new
+ * MSTCPY locks from other nodes already which we need to ignore
+ * when setting the new nodeid.
+ */
+
+static void set_lock_master(struct list_head *queue, int nodeid)
+{
+       struct dlm_lkb *lkb;
+
+       list_for_each_entry(lkb, queue, lkb_statequeue)
+               if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
+                       lkb->lkb_nodeid = nodeid;
+}
+
+static void set_master_lkbs(struct dlm_rsb *r)
+{
+       set_lock_master(&r->res_grantqueue, r->res_nodeid);
+       set_lock_master(&r->res_convertqueue, r->res_nodeid);
+       set_lock_master(&r->res_waitqueue, r->res_nodeid);
+}
+
+/*
+ * Propogate the new master nodeid to locks
+ * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
+ * The NEW_MASTER2 flag tells recover_lvb() which rsb's to consider.
+ */
+
+static void set_new_master(struct dlm_rsb *r, int nodeid)
+{
+       lock_rsb(r);
+       r->res_nodeid = nodeid;
+       set_master_lkbs(r);
+       rsb_set_flag(r, RSB_NEW_MASTER);
+       rsb_set_flag(r, RSB_NEW_MASTER2);
+       unlock_rsb(r);
+}
+
+/*
+ * We do async lookups on rsb's that need new masters.  The rsb's
+ * waiting for a lookup reply are kept on the recover_list.
+ */
+
+static int recover_master(struct dlm_rsb *r)
+{
+       struct dlm_ls *ls = r->res_ls;
+       int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
+
+       dir_nodeid = dlm_dir_nodeid(r);
+
+       if (dir_nodeid == our_nodeid) {
+               error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
+                                      r->res_length, &ret_nodeid);
+               if (error)
+                       log_error(ls, "recover dir lookup error %d", error);
+
+               if (ret_nodeid == our_nodeid)
+                       ret_nodeid = 0;
+               set_new_master(r, ret_nodeid);
+       } else {
+               recover_list_add(r);
+               error = dlm_send_rcom_lookup(r, dir_nodeid);
+       }
+
+       return error;
+}
+
+/*
+ * When not using a directory, most resource names will hash to a new static
+ * master nodeid and the resource will need to be remastered.
+ */
+
+static int recover_master_static(struct dlm_rsb *r)
+{
+       int master = dlm_dir_nodeid(r);
+
+       if (master == dlm_our_nodeid())
+               master = 0;
+
+       if (r->res_nodeid != master) {
+               if (is_master(r))
+                       dlm_purge_mstcpy_locks(r);
+               set_new_master(r, master);
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * Go through local root resources and for each rsb which has a master which
+ * has departed, get the new master nodeid from the directory.  The dir will
+ * assign mastery to the first node to look up the new master.  That means
+ * we'll discover in this lookup if we're the new master of any rsb's.
+ *
+ * We fire off all the dir lookup requests individually and asynchronously to
+ * the correct dir node.
+ */
+
+int dlm_recover_masters(struct dlm_ls *ls)
+{
+       struct dlm_rsb *r;
+       int error = 0, count = 0;
+
+       log_debug(ls, "dlm_recover_masters");
+
+       down_read(&ls->ls_root_sem);
+       list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+               if (dlm_recovery_stopped(ls)) {
+                       up_read(&ls->ls_root_sem);
+                       error = -EINTR;
+                       goto out;
+               }
+
+               if (dlm_no_directory(ls))
+                       count += recover_master_static(r);
+               else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
+                       recover_master(r);
+                       count++;
+               }
+
+               schedule();
+       }
+       up_read(&ls->ls_root_sem);
+
+       log_debug(ls, "dlm_recover_masters %d resources", count);
+
+       error = dlm_wait_function(ls, &recover_list_empty);
+ out:
+       if (error)
+               recover_list_clear(ls);
+       return error;
+}
+
+int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+       struct dlm_rsb *r;
+       int nodeid;
+
+       r = recover_list_find(ls, rc->rc_id);
+       if (!r) {
+               log_error(ls, "dlm_recover_master_reply no id %"PRIx64"",
+                         rc->rc_id);
+               goto out;
+       }
+
+       nodeid = rc->rc_result;
+       if (nodeid == dlm_our_nodeid())
+               nodeid = 0;
+
+       set_new_master(r, nodeid);
+       recover_list_del(r);
+
+       if (recover_list_empty(ls))
+               wake_up(&ls->ls_wait_general);
+ out:
+       return 0;
+}
+
+
+/* Lock recovery: rebuild the process-copy locks we hold on a
+   remastered rsb on the new rsb master.
+
+   dlm_recover_locks
+   recover_locks
+   recover_locks_queue
+   dlm_send_rcom_lock              ->  receive_rcom_lock
+                                       dlm_recover_master_copy
+   receive_rcom_lock_reply         <-
+   dlm_recover_process_copy
+*/
+
+
+/*
+ * keep a count of the number of lkb's we send to the new master; when we get
+ * an equal number of replies then recovery for the rsb is done
+ */
+
+static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
+{
+       struct dlm_lkb *lkb;
+       int error = 0;
+
+       list_for_each_entry(lkb, head, lkb_statequeue) {
+               error = dlm_send_rcom_lock(r, lkb);
+               if (error)
+                       break;
+               r->res_recover_locks_count++;
+       }
+
+       return error;
+}
+
+static int all_queues_empty(struct dlm_rsb *r)
+{
+       if (!list_empty(&r->res_grantqueue) ||
+           !list_empty(&r->res_convertqueue) ||
+           !list_empty(&r->res_waitqueue))
+               return FALSE;
+       return TRUE;
+}
+
+static int recover_locks(struct dlm_rsb *r)
+{
+       int error = 0;
+
+       lock_rsb(r);
+       if (all_queues_empty(r))
+               goto out;
+
+       DLM_ASSERT(!r->res_recover_locks_count, dlm_print_rsb(r););
+
+       error = recover_locks_queue(r, &r->res_grantqueue);
+       if (error)
+               goto out;
+       error = recover_locks_queue(r, &r->res_convertqueue);
+       if (error)
+               goto out;
+       error = recover_locks_queue(r, &r->res_waitqueue);
+       if (error)
+               goto out;
+
+       if (r->res_recover_locks_count)
+               recover_list_add(r);
+       else
+               rsb_clear_flag(r, RSB_NEW_MASTER);
+ out:
+       unlock_rsb(r);
+       return error;
+}
+
+int dlm_recover_locks(struct dlm_ls *ls)
+{
+       struct dlm_rsb *r;
+       int error, count = 0;
+
+       log_debug(ls, "dlm_recover_locks");
+
+       down_read(&ls->ls_root_sem);
+       list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+               if (is_master(r)) {
+                       rsb_clear_flag(r, RSB_NEW_MASTER);
+                       continue;
+               }
+
+               if (!rsb_flag(r, RSB_NEW_MASTER))
+                       continue;
+
+               if (dlm_recovery_stopped(ls)) {
+                       error = -EINTR;
+                       up_read(&ls->ls_root_sem);
+                       goto out;
+               }
+
+               error = recover_locks(r);
+               if (error) {
+                       up_read(&ls->ls_root_sem);
+                       goto out;
+               }
+
+               count += r->res_recover_locks_count;
+       }
+       up_read(&ls->ls_root_sem);
+
+       log_debug(ls, "dlm_recover_locks %d locks", count);
+
+       error = dlm_wait_function(ls, &recover_list_empty);
+ out:
+       if (error)
+               recover_list_clear(ls);
+       else
+               dlm_set_recover_status(ls, DLM_RS_LOCKS);
+       return error;
+}
+
+void dlm_recovered_lock(struct dlm_rsb *r)
+{
+       DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_print_rsb(r););
+
+       r->res_recover_locks_count--;
+       if (!r->res_recover_locks_count) {
+               rsb_clear_flag(r, RSB_NEW_MASTER);
+               recover_list_del(r);
+       }
+
+       if (recover_list_empty(r->res_ls))
+               wake_up(&r->res_ls->ls_wait_general);
+}
+
+/*
+ * The lvb needs to be recovered on all master rsb's.  This includes setting
+ * the VALNOTVALID flag if necessary, and determining the correct lvb contents
+ * based on the lvb's of the locks held on the rsb.
+ *
+ * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb.  If it
+ * was already set prior to recovery, it's not cleared, regardless of locks.
+ *
+ * The LVB contents are only considered for changing when this is a new master
+ * of the rsb (NEW_MASTER2).  Then, the rsb's lvb is taken from any lkb with
+ * mode > CR.  If no lkb's exist with mode above CR, the lvb contents are taken
+ * from the lkb with the largest lvb sequence number.
+ */
+
+static void recover_lvb(struct dlm_rsb *r)
+{
+       struct dlm_lkb *lkb, *high_lkb = NULL;
+       uint32_t high_seq = 0;
+       int lock_lvb_exists = FALSE;
+       int big_lock_exists = FALSE;
+       int lvblen = r->res_ls->ls_lvblen;
+
+       list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+               if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+                       continue;
+
+               lock_lvb_exists = TRUE;
+
+               if (lkb->lkb_grmode > DLM_LOCK_CR) {
+                       big_lock_exists = TRUE;
+                       goto setflag;
+               }
+
+               if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
+                       high_lkb = lkb;
+                       high_seq = lkb->lkb_lvbseq;
+               }
+       }
+
+       list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+               if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+                       continue;
+
+               lock_lvb_exists = TRUE;
+
+               if (lkb->lkb_grmode > DLM_LOCK_CR) {
+                       big_lock_exists = TRUE;
+                       goto setflag;
+               }
+
+               if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
+                       high_lkb = lkb;
+                       high_seq = lkb->lkb_lvbseq;
+               }
+       }
+
+ setflag:
+       if (!lock_lvb_exists)
+               goto out;
+
+       if (!big_lock_exists)
+               rsb_set_flag(r, RSB_VALNOTVALID);
+
+       /* don't mess with the lvb unless we're the new master */
+       if (!rsb_flag(r, RSB_NEW_MASTER2))
+               goto out;
+
+       if (!r->res_lvbptr) {
+               r->res_lvbptr = allocate_lvb(r->res_ls);
+               if (!r->res_lvbptr)
+                       goto out;
+       }
+
+       if (big_lock_exists) {
+               r->res_lvbseq = lkb->lkb_lvbseq;
+               memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
+       } else if (high_lkb) {
+               r->res_lvbseq = high_lkb->lkb_lvbseq;
+               memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
+       } else {
+               r->res_lvbseq = 0;
+               memset(r->res_lvbptr, 0, lvblen);
+       }
+ out:
+       return;
+}
+
+/* All master rsb's flagged RECOVER_CONVERT need to be looked at.  The locks
+   converting PR->CW or CW->PR need to have their lkb_grmode set. */
+
+static void recover_conversion(struct dlm_rsb *r)
+{
+       struct dlm_lkb *lkb;
+       int grmode = -1;
+
+       list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+               if (lkb->lkb_grmode == DLM_LOCK_PR ||
+                   lkb->lkb_grmode == DLM_LOCK_CW) {
+                       grmode = lkb->lkb_grmode;
+                       break;
+               }
+       }
+
+       list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+               if (lkb->lkb_grmode != DLM_LOCK_IV)
+                       continue;
+               if (grmode == -1)
+                       lkb->lkb_grmode = lkb->lkb_rqmode;
+               else
+                       lkb->lkb_grmode = grmode;
+       }
+}
+
+void dlm_recover_rsbs(struct dlm_ls *ls)
+{
+       struct dlm_rsb *r;
+       int count = 0;
+
+       log_debug(ls, "dlm_recover_rsbs");
+
+       down_read(&ls->ls_root_sem);
+       list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+               lock_rsb(r);
+               if (is_master(r)) {
+                       if (rsb_flag(r, RSB_RECOVER_CONVERT))
+                               recover_conversion(r);
+                       recover_lvb(r);
+                       count++;
+               }
+               rsb_clear_flag(r, RSB_RECOVER_CONVERT);
+               unlock_rsb(r);
+       }
+       up_read(&ls->ls_root_sem);
+
+       log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
+}
+
+/* Create a single list of all root rsb's to be used during recovery */
+
+int dlm_create_root_list(struct dlm_ls *ls)
+{
+       struct dlm_rsb *r;
+       int i, error = 0;
+
+       down_write(&ls->ls_root_sem);
+       if (!list_empty(&ls->ls_root_list)) {
+               log_error(ls, "root list not empty");
+               error = -EINVAL;
+               goto out;
+       }
+
+       for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+               read_lock(&ls->ls_rsbtbl[i].lock);
+               list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
+                       list_add(&r->res_root_list, &ls->ls_root_list);
+                       dlm_hold_rsb(r);
+               }
+               read_unlock(&ls->ls_rsbtbl[i].lock);
+       }
+ out:
+       up_write(&ls->ls_root_sem);
+       return error;
+}
+
+void dlm_release_root_list(struct dlm_ls *ls)
+{
+       struct dlm_rsb *r, *safe;
+
+       down_write(&ls->ls_root_sem);
+       list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
+               list_del_init(&r->res_root_list);
+               dlm_put_rsb(r);
+       }
+       up_write(&ls->ls_root_sem);
+}
+
+void dlm_clear_toss_list(struct dlm_ls *ls)
+{
+       struct dlm_rsb *r, *safe;
+       int i;
+
+       for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+               write_lock(&ls->ls_rsbtbl[i].lock);
+               list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
+                                        res_hashchain) {
+                       list_del(&r->res_hashchain);
+                       free_rsb(r);
+               }
+               write_unlock(&ls->ls_rsbtbl[i].lock);
+       }
+}
+
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644 (file)
index 0000000..ebd0363
--- /dev/null
@@ -0,0 +1,34 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RECOVER_DOT_H__
+#define __RECOVER_DOT_H__
+
+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
+uint32_t dlm_recover_status(struct dlm_ls *ls);
+void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
+int dlm_recover_members_wait(struct dlm_ls *ls);
+int dlm_recover_directory_wait(struct dlm_ls *ls);
+int dlm_recover_locks_wait(struct dlm_ls *ls);
+int dlm_recover_done_wait(struct dlm_ls *ls);
+int dlm_recover_masters(struct dlm_ls *ls);
+int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_recover_locks(struct dlm_ls *ls);
+void dlm_recovered_lock(struct dlm_rsb *r);
+int dlm_create_root_list(struct dlm_ls *ls);
+void dlm_release_root_list(struct dlm_ls *ls);
+void dlm_clear_toss_list(struct dlm_ls *ls);
+void dlm_recover_rsbs(struct dlm_ls *ls);
+
+#endif                         /* __RECOVER_DOT_H__ */
+
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644 (file)
index 0000000..06e4f7c
--- /dev/null
@@ -0,0 +1,285 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "dir.h"
+#include "ast.h"
+#include "recover.h"
+#include "lowcomms.h"
+#include "lock.h"
+#include "requestqueue.h"
+#include "recoverd.h"
+
+
+/* If the start for which we're re-enabling locking (seq) has been superseded
+   by a newer stop (ls_recover_seq), we need to leave locking disabled. */
+
+static int enable_locking(struct dlm_ls *ls, uint64_t seq)
+{
+       int error = -EINTR;
+
+       spin_lock(&ls->ls_recover_lock);
+       if (ls->ls_recover_seq == seq) {
+               set_bit(LSFL_RUNNING, &ls->ls_flags);
+               up_write(&ls->ls_in_recovery);
+               error = 0;
+       }
+       spin_unlock(&ls->ls_recover_lock);
+       return error;
+}
+
+static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
+{
+       unsigned long start;
+       int error, neg = 0;
+
+       log_debug(ls, "recover %"PRIx64"", rv->seq);
+
+       down(&ls->ls_recoverd_active);
+
+       /*
+        * Suspending and resuming dlm_astd ensures that no lkb's from this ls
+        * will be processed by dlm_astd during recovery.
+        */
+
+       dlm_astd_suspend();
+       dlm_astd_resume();
+
+       /*
+        * This list of root rsb's will be the basis of most of the recovery
+        * routines.
+        */
+
+       dlm_create_root_list(ls);
+
+       /*
+        * Free all the tossed rsb's so we don't have to recover them.
+        */
+
+       dlm_clear_toss_list(ls);
+
+       /*
+        * Add or remove nodes from the lockspace's ls_nodes list.
+        * Also waits for all nodes to complete dlm_recover_members.
+        */
+
+       error = dlm_recover_members(ls, rv, &neg);
+       if (error) {
+               log_error(ls, "recover_members failed %d", error);
+               goto fail;
+       }
+       start = jiffies;
+
+       /*
+        * Rebuild our own share of the directory by collecting from all other
+        * nodes their master rsb names that hash to us.
+        */
+
+       error = dlm_recover_directory(ls);
+       if (error) {
+               log_error(ls, "recover_directory failed %d", error);
+               goto fail;
+       }
+
+       /*
+        * Purge directory-related requests that are saved in requestqueue.
+        * All dir requests from before recovery are invalid now due to the dir
+        * rebuild and will be resent by the requesting nodes.
+        */
+
+       dlm_purge_requestqueue(ls);
+
+       /*
+        * Wait for all nodes to complete directory rebuild.
+        */
+
+       error = dlm_recover_directory_wait(ls);
+       if (error) {
+               log_error(ls, "recover_directory_wait failed %d", error);
+               goto fail;
+       }
+
+       /*
+        * We may have outstanding operations that are waiting for a reply from
+        * a failed node.  Mark these to be resent after recovery.  Unlock and
+        * cancel ops can just be completed.
+        */
+
+       dlm_recover_waiters_pre(ls);
+
+       error = dlm_recovery_stopped(ls);
+       if (error)
+               goto fail;
+
+       if (neg || dlm_no_directory(ls)) {
+               /*
+                * Clear lkb's for departed nodes.
+                */
+
+               dlm_purge_locks(ls);
+
+               /*
+                * Get new master nodeid's for rsb's that were mastered on
+                * departed nodes.
+                */
+
+               error = dlm_recover_masters(ls);
+               if (error) {
+                       log_error(ls, "recover_masters failed %d", error);
+                       goto fail;
+               }
+
+               /*
+                * Send our locks on remastered rsb's to the new masters.
+                */
+
+               error = dlm_recover_locks(ls);
+               if (error) {
+                       log_error(ls, "recover_locks failed %d", error);
+                       goto fail;
+               }
+
+               error = dlm_recover_locks_wait(ls);
+               if (error) {
+                       log_error(ls, "recover_locks_wait failed %d", error);
+                       goto fail;
+               }
+
+               /*
+                * Finalize state in master rsb's now that all locks can be
+                * checked.  This includes conversion resolution and lvb
+                * settings.
+                */
+
+               dlm_recover_rsbs(ls);
+       }
+
+       dlm_release_root_list(ls);
+
+       dlm_set_recover_status(ls, DLM_RS_DONE);
+       error = dlm_recover_done_wait(ls);
+       if (error) {
+               log_error(ls, "recover_done_wait failed %d", error);
+               goto fail;
+       }
+
+       dlm_clear_members_gone(ls);
+
+       error = enable_locking(ls, rv->seq);
+       if (error) {
+               log_error(ls, "enable_locking failed %d", error);
+               goto fail;
+       }
+
+       error = dlm_process_requestqueue(ls);
+       if (error) {
+               log_error(ls, "process_requestqueue failed %d", error);
+               goto fail;
+       }
+
+       error = dlm_recover_waiters_post(ls);
+       if (error) {
+               log_error(ls, "recover_waiters_post failed %d", error);
+               goto fail;
+       }
+
+       dlm_grant_after_purge(ls);
+
+       dlm_astd_wake();
+
+       log_debug(ls, "recover %"PRIx64" done: %u ms", rv->seq,
+                 jiffies_to_msecs(jiffies - start));
+       up(&ls->ls_recoverd_active);
+
+       return 0;
+
+ fail:
+       dlm_release_root_list(ls);
+       log_debug(ls, "recover %"PRIx64" error %d", rv->seq, error);
+       up(&ls->ls_recoverd_active);
+       return error;
+}
+
+static void do_ls_recovery(struct dlm_ls *ls)
+{
+       struct dlm_recover *rv = NULL;
+
+       spin_lock(&ls->ls_recover_lock);
+       rv = ls->ls_recover_args;
+       ls->ls_recover_args = NULL;
+       clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+       spin_unlock(&ls->ls_recover_lock);
+
+       if (rv) {
+               ls_recover(ls, rv);
+               kfree(rv->nodeids);
+               kfree(rv);
+       }
+}
+
+static int dlm_recoverd(void *arg)
+{
+       struct dlm_ls *ls;
+
+       ls = dlm_find_lockspace_local(arg);
+
+       while (!kthread_should_stop()) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (!test_bit(LSFL_WORK, &ls->ls_flags))
+                       schedule();
+               set_current_state(TASK_RUNNING);
+
+               if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
+                       do_ls_recovery(ls);
+       }
+
+       dlm_put_lockspace(ls);
+       return 0;
+}
+
+void dlm_recoverd_kick(struct dlm_ls *ls)
+{
+       set_bit(LSFL_WORK, &ls->ls_flags);
+       wake_up_process(ls->ls_recoverd_task);
+}
+
+int dlm_recoverd_start(struct dlm_ls *ls)
+{
+       struct task_struct *p;
+       int error = 0;
+
+       p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
+       if (IS_ERR(p))
+               error = PTR_ERR(p);
+       else
+                ls->ls_recoverd_task = p;
+       return error;
+}
+
+void dlm_recoverd_stop(struct dlm_ls *ls)
+{
+       kthread_stop(ls->ls_recoverd_task);
+}
+
+void dlm_recoverd_suspend(struct dlm_ls *ls)
+{
+       down(&ls->ls_recoverd_active);
+}
+
+void dlm_recoverd_resume(struct dlm_ls *ls)
+{
+       up(&ls->ls_recoverd_active);
+}
+
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644 (file)
index 0000000..866657c
--- /dev/null
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RECOVERD_DOT_H__
+#define __RECOVERD_DOT_H__
+
+void dlm_recoverd_kick(struct dlm_ls *ls);
+void dlm_recoverd_stop(struct dlm_ls *ls);
+int dlm_recoverd_start(struct dlm_ls *ls);
+void dlm_recoverd_suspend(struct dlm_ls *ls);
+void dlm_recoverd_resume(struct dlm_ls *ls);
+
+#endif                         /* __RECOVERD_DOT_H__ */
+
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644 (file)
index 0000000..36afe99
--- /dev/null
@@ -0,0 +1,184 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "member.h"
+#include "lock.h"
+#include "dir.h"
+#include "config.h"
+#include "requestqueue.h"
+
+struct rq_entry {
+       struct list_head list;
+       int nodeid;
+       char request[1];
+};
+
+/*
+ * Requests received while the lockspace is in recovery get added to the
+ * request queue and processed when recovery is complete.  This happens when
+ * the lockspace is suspended on some nodes before it is on others, or the
+ * lockspace is enabled on some while still suspended on others.
+ */
+
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
+{
+       struct rq_entry *e;
+       int length = hd->h_length;
+
+       if (dlm_is_removed(ls, nodeid))
+               return;
+
+       e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
+       if (!e) {
+               log_print("dlm_add_requestqueue: out of memory\n");
+               return;
+       }
+
+       e->nodeid = nodeid;
+       memcpy(e->request, hd, length);
+
+       down(&ls->ls_requestqueue_lock);
+       list_add_tail(&e->list, &ls->ls_requestqueue);
+       up(&ls->ls_requestqueue_lock);
+}
+
+int dlm_process_requestqueue(struct dlm_ls *ls)
+{
+       struct rq_entry *e;
+       struct dlm_header *hd;
+       int error = 0;
+
+       down(&ls->ls_requestqueue_lock);
+
+       for (;;) {
+               if (list_empty(&ls->ls_requestqueue)) {
+                       up(&ls->ls_requestqueue_lock);
+                       error = 0;
+                       break;
+               }
+               e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
+               up(&ls->ls_requestqueue_lock);
+
+               hd = (struct dlm_header *) e->request;
+               error = dlm_receive_message(hd, e->nodeid, TRUE);
+
+               if (error == -EINTR) {
+                       /* entry is left on requestqueue */
+                       log_debug(ls, "process_requestqueue abort eintr");
+                       break;
+               }
+
+               down(&ls->ls_requestqueue_lock);
+               list_del(&e->list);
+               kfree(e);
+
+               if (dlm_locking_stopped(ls)) {
+                       log_debug(ls, "process_requestqueue abort running");
+                       up(&ls->ls_requestqueue_lock);
+                       error = -EINTR;
+                       break;
+               }
+               schedule();
+       }
+
+       return error;
+}
+
+/*
+ * After recovery is done, locking is resumed and dlm_recoverd takes all the
+ * saved requests and processes them as they would have been by dlm_recvd.  At
+ * the same time, dlm_recvd will start receiving new requests from remote
+ * nodes.  We want to delay dlm_recvd processing new requests until
+ * dlm_recoverd has finished processing the old saved requests.
+ */
+
+void dlm_wait_requestqueue(struct dlm_ls *ls)
+{
+       for (;;) {
+               down(&ls->ls_requestqueue_lock);
+               if (list_empty(&ls->ls_requestqueue))
+                       break;
+               if (dlm_locking_stopped(ls))
+                       break;
+               up(&ls->ls_requestqueue_lock);
+               schedule();
+       }
+       up(&ls->ls_requestqueue_lock);
+}
+
+static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
+{
+       uint32_t type = ms->m_type;
+
+       if (dlm_is_removed(ls, nodeid))
+               return 1;
+
+       /* directory operations are always purged because the directory is
+          always rebuilt during recovery and the lookups resent */
+
+       if (type == DLM_MSG_REMOVE ||
+           type == DLM_MSG_LOOKUP ||
+           type == DLM_MSG_LOOKUP_REPLY)
+               return 1;
+
+       if (!dlm_no_directory(ls))
+               return 0;
+
+       /* with no directory, the master is likely to change as a part of
+          recovery; requests to/from the defunct master need to be purged */
+
+       switch (type) {
+       case DLM_MSG_REQUEST:
+       case DLM_MSG_CONVERT:
+       case DLM_MSG_UNLOCK:
+       case DLM_MSG_CANCEL:
+               /* we're no longer the master of this resource, the sender
+                  will resend to the new master (see waiter_needs_recovery) */
+
+               if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
+                       return 1;
+               break;
+
+       case DLM_MSG_REQUEST_REPLY:
+       case DLM_MSG_CONVERT_REPLY:
+       case DLM_MSG_UNLOCK_REPLY:
+       case DLM_MSG_CANCEL_REPLY:
+       case DLM_MSG_GRANT:
+               /* this reply is from the former master of the resource,
+                  we'll resend to the new master if needed */
+
+               if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
+                       return 1;
+               break;
+       }
+
+       return 0;
+}
+
+void dlm_purge_requestqueue(struct dlm_ls *ls)
+{
+       struct dlm_message *ms;
+       struct rq_entry *e, *safe;
+
+       down(&ls->ls_requestqueue_lock);
+       list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
+               ms = (struct dlm_message *) e->request;
+
+               if (purge_request(ls, ms, e->nodeid)) {
+                       list_del(&e->list);
+                       kfree(e);
+               }
+       }
+       up(&ls->ls_requestqueue_lock);
+}
+
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644 (file)
index 0000000..349f0d2
--- /dev/null
@@ -0,0 +1,22 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __REQUESTQUEUE_DOT_H__
+#define __REQUESTQUEUE_DOT_H__
+
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
+int dlm_process_requestqueue(struct dlm_ls *ls);
+void dlm_wait_requestqueue(struct dlm_ls *ls);
+void dlm_purge_requestqueue(struct dlm_ls *ls);
+
+#endif
+
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644 (file)
index 0000000..826d122
--- /dev/null
@@ -0,0 +1,173 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "rcom.h"
+#include "util.h"
+
+static void header_out(struct dlm_header *hd)
+{
+       hd->h_version           = cpu_to_le32(hd->h_version);
+       hd->h_lockspace         = cpu_to_le32(hd->h_lockspace);
+       hd->h_nodeid            = cpu_to_le32(hd->h_nodeid);
+       hd->h_length            = cpu_to_le16(hd->h_length);
+}
+
+static void header_in(struct dlm_header *hd)
+{
+       hd->h_version           = le32_to_cpu(hd->h_version);
+       hd->h_lockspace         = le32_to_cpu(hd->h_lockspace);
+       hd->h_nodeid            = le32_to_cpu(hd->h_nodeid);
+       hd->h_length            = le16_to_cpu(hd->h_length);
+}
+
+void dlm_message_out(struct dlm_message *ms)
+{
+       struct dlm_header *hd = (struct dlm_header *) ms;
+
+       header_out(hd);
+
+       ms->m_type              = cpu_to_le32(ms->m_type);
+       ms->m_nodeid            = cpu_to_le32(ms->m_nodeid);
+       ms->m_pid               = cpu_to_le32(ms->m_pid);
+       ms->m_lkid              = cpu_to_le32(ms->m_lkid);
+       ms->m_remid             = cpu_to_le32(ms->m_remid);
+       ms->m_parent_lkid       = cpu_to_le32(ms->m_parent_lkid);
+       ms->m_parent_remid      = cpu_to_le32(ms->m_parent_remid);
+       ms->m_exflags           = cpu_to_le32(ms->m_exflags);
+       ms->m_sbflags           = cpu_to_le32(ms->m_sbflags);
+       ms->m_flags             = cpu_to_le32(ms->m_flags);
+       ms->m_lvbseq            = cpu_to_le32(ms->m_lvbseq);
+       ms->m_hash              = cpu_to_le32(ms->m_hash);
+       ms->m_status            = cpu_to_le32(ms->m_status);
+       ms->m_grmode            = cpu_to_le32(ms->m_grmode);
+       ms->m_rqmode            = cpu_to_le32(ms->m_rqmode);
+       ms->m_bastmode          = cpu_to_le32(ms->m_bastmode);
+       ms->m_asts              = cpu_to_le32(ms->m_asts);
+       ms->m_result            = cpu_to_le32(ms->m_result);
+       ms->m_range[0]          = cpu_to_le64(ms->m_range[0]);
+       ms->m_range[1]          = cpu_to_le64(ms->m_range[1]);
+}
+
+void dlm_message_in(struct dlm_message *ms)
+{
+       struct dlm_header *hd = (struct dlm_header *) ms;
+
+       header_in(hd);
+
+       ms->m_type              = le32_to_cpu(ms->m_type);
+       ms->m_nodeid            = le32_to_cpu(ms->m_nodeid);
+       ms->m_pid               = le32_to_cpu(ms->m_pid);
+       ms->m_lkid              = le32_to_cpu(ms->m_lkid);
+       ms->m_remid             = le32_to_cpu(ms->m_remid);
+       ms->m_parent_lkid       = le32_to_cpu(ms->m_parent_lkid);
+       ms->m_parent_remid      = le32_to_cpu(ms->m_parent_remid);
+       ms->m_exflags           = le32_to_cpu(ms->m_exflags);
+       ms->m_sbflags           = le32_to_cpu(ms->m_sbflags);
+       ms->m_flags             = le32_to_cpu(ms->m_flags);
+       ms->m_lvbseq            = le32_to_cpu(ms->m_lvbseq);
+       ms->m_hash              = le32_to_cpu(ms->m_hash);
+       ms->m_status            = le32_to_cpu(ms->m_status);
+       ms->m_grmode            = le32_to_cpu(ms->m_grmode);
+       ms->m_rqmode            = le32_to_cpu(ms->m_rqmode);
+       ms->m_bastmode          = le32_to_cpu(ms->m_bastmode);
+       ms->m_asts              = le32_to_cpu(ms->m_asts);
+       ms->m_result            = le32_to_cpu(ms->m_result);
+       ms->m_range[0]          = le64_to_cpu(ms->m_range[0]);
+       ms->m_range[1]          = le64_to_cpu(ms->m_range[1]);
+}
+
+static void rcom_lock_out(struct rcom_lock *rl)
+{
+       rl->rl_ownpid           = cpu_to_le32(rl->rl_ownpid);
+       rl->rl_lkid             = cpu_to_le32(rl->rl_lkid);
+       rl->rl_remid            = cpu_to_le32(rl->rl_remid);
+       rl->rl_parent_lkid      = cpu_to_le32(rl->rl_parent_lkid);
+       rl->rl_parent_remid     = cpu_to_le32(rl->rl_parent_remid);
+       rl->rl_exflags          = cpu_to_le32(rl->rl_exflags);
+       rl->rl_flags            = cpu_to_le32(rl->rl_flags);
+       rl->rl_lvbseq           = cpu_to_le32(rl->rl_lvbseq);
+       rl->rl_result           = cpu_to_le32(rl->rl_result);
+       rl->rl_wait_type        = cpu_to_le16(rl->rl_wait_type);
+       rl->rl_namelen          = cpu_to_le16(rl->rl_namelen);
+       rl->rl_range[0]         = cpu_to_le64(rl->rl_range[0]);
+       rl->rl_range[1]         = cpu_to_le64(rl->rl_range[1]);
+       rl->rl_range[2]         = cpu_to_le64(rl->rl_range[2]);
+       rl->rl_range[3]         = cpu_to_le64(rl->rl_range[3]);
+}
+
+static void rcom_lock_in(struct rcom_lock *rl)
+{
+       rl->rl_ownpid           = le32_to_cpu(rl->rl_ownpid);
+       rl->rl_lkid             = le32_to_cpu(rl->rl_lkid);
+       rl->rl_remid            = le32_to_cpu(rl->rl_remid);
+       rl->rl_parent_lkid      = le32_to_cpu(rl->rl_parent_lkid);
+       rl->rl_parent_remid     = le32_to_cpu(rl->rl_parent_remid);
+       rl->rl_exflags          = le32_to_cpu(rl->rl_exflags);
+       rl->rl_flags            = le32_to_cpu(rl->rl_flags);
+       rl->rl_lvbseq           = le32_to_cpu(rl->rl_lvbseq);
+       rl->rl_result           = le32_to_cpu(rl->rl_result);
+       rl->rl_wait_type        = le16_to_cpu(rl->rl_wait_type);
+       rl->rl_namelen          = le16_to_cpu(rl->rl_namelen);
+       rl->rl_range[0]         = le64_to_cpu(rl->rl_range[0]);
+       rl->rl_range[1]         = le64_to_cpu(rl->rl_range[1]);
+       rl->rl_range[2]         = le64_to_cpu(rl->rl_range[2]);
+       rl->rl_range[3]         = le64_to_cpu(rl->rl_range[3]);
+}
+
+static void rcom_config_out(struct rcom_config *rf)
+{
+       rf->rf_lvblen           = cpu_to_le32(rf->rf_lvblen);
+       rf->rf_lsflags          = cpu_to_le32(rf->rf_lsflags);
+}
+
+static void rcom_config_in(struct rcom_config *rf)
+{
+       rf->rf_lvblen           = le32_to_cpu(rf->rf_lvblen);
+       rf->rf_lsflags          = le32_to_cpu(rf->rf_lsflags);
+}
+
+void dlm_rcom_out(struct dlm_rcom *rc)
+{
+       struct dlm_header *hd = (struct dlm_header *) rc;
+       int type = rc->rc_type;
+
+       header_out(hd);
+
+       rc->rc_type             = cpu_to_le32(rc->rc_type);
+       rc->rc_result           = cpu_to_le32(rc->rc_result);
+       rc->rc_id               = cpu_to_le64(rc->rc_id);
+
+       if (type == DLM_RCOM_LOCK)
+               rcom_lock_out((struct rcom_lock *) rc->rc_buf);
+
+       else if (type == DLM_RCOM_STATUS_REPLY)
+               rcom_config_out((struct rcom_config *) rc->rc_buf);
+}
+
+void dlm_rcom_in(struct dlm_rcom *rc)
+{
+       struct dlm_header *hd = (struct dlm_header *) rc;
+
+       header_in(hd);
+
+       rc->rc_type             = le32_to_cpu(rc->rc_type);
+       rc->rc_result           = le32_to_cpu(rc->rc_result);
+       rc->rc_id               = le64_to_cpu(rc->rc_id);
+
+       if (rc->rc_type == DLM_RCOM_LOCK)
+               rcom_lock_in((struct rcom_lock *) rc->rc_buf);
+
+       else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
+               rcom_config_in((struct rcom_config *) rc->rc_buf);
+}
+
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644 (file)
index 0000000..2b25915
--- /dev/null
@@ -0,0 +1,22 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __UTIL_DOT_H__
+#define __UTIL_DOT_H__
+
+void dlm_message_out(struct dlm_message *ms);
+void dlm_message_in(struct dlm_message *ms);
+void dlm_rcom_out(struct dlm_rcom *rc);
+void dlm_rcom_in(struct dlm_rcom *rc);
+
+#endif
+
diff --git a/include/linux/dlm.h b/include/linux/dlm.h
new file mode 100644 (file)
index 0000000..dd324ba
--- /dev/null
@@ -0,0 +1,312 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __DLM_DOT_H__
+#define __DLM_DOT_H__
+
+/*
+ * Interface to Distributed Lock Manager (DLM)
+ * routines and structures to use DLM lockspaces
+ */
+
+/*
+ * Lock Modes
+ */
+
+#define DLM_LOCK_IV            -1      /* invalid */
+#define DLM_LOCK_NL            0       /* null */
+#define DLM_LOCK_CR            1       /* concurrent read */
+#define DLM_LOCK_CW            2       /* concurrent write */
+#define DLM_LOCK_PR            3       /* protected read */
+#define DLM_LOCK_PW            4       /* protected write */
+#define DLM_LOCK_EX            5       /* exclusive */
+
+/*
+ * Maximum size in bytes of a dlm_lock name
+ */
+
+#define DLM_RESNAME_MAXLEN     64
+
+/*
+ * Flags to dlm_lock
+ *
+ * DLM_LKF_NOQUEUE
+ *
+ * Do not queue the lock request on the wait queue if it cannot be granted
+ * immediately.  If the lock cannot be granted because of this flag, DLM will
+ * either return -EAGAIN from the dlm_lock call or will return 0 from
+ * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
+ *
+ * DLM_LKF_CANCEL
+ *
+ * Used to cancel a pending lock request or conversion.  A converting lock is
+ * returned to its previously granted mode.
+ *
+ * DLM_LKF_CONVERT
+ *
+ * Indicates a lock conversion request.  For conversions the name and namelen
+ * are ignored and the lock ID in the LKSB is used to identify the lock.
+ *
+ * DLM_LKF_VALBLK
+ *
+ * Requests DLM to return the current contents of the lock value block in the
+ * lock status block.  When this flag is set in a lock conversion from PW or EX
+ * modes, DLM assigns the value specified in the lock status block to the lock
+ * value block of the lock resource.  The LVB is a DLM_LVB_LEN size array
+ * containing application-specific information.
+ *
+ * DLM_LKF_QUECVT
+ *
+ * Force a conversion request to be queued, even if it is compatible with
+ * the granted modes of other locks on the same resource.
+ *
+ * DLM_LKF_IVVALBLK
+ *
+ * Invalidate the lock value block.
+ *
+ * DLM_LKF_CONVDEADLK
+ *
+ * Allows the dlm to resolve conversion deadlocks internally by demoting the
+ * granted mode of a converting lock to NL.  The DLM_SBF_DEMOTED flag is
+ * returned for a conversion that's been effected by this.
+ *
+ * DLM_LKF_PERSISTENT
+ *
+ * Only relevant to locks originating in userspace.  A persistent lock will not
+ * be removed if the process holding the lock exits.
+ *
+ * DLM_LKF_NODLKWT
+ * DLM_LKF_NODLCKBLK
+ *
+ * net yet implemented
+ *
+ * DLM_LKF_EXPEDITE
+ *
+ * Used only with new requests for NL mode locks.  Tells the lock manager
+ * to grant the lock, ignoring other locks in convert and wait queues.
+ *
+ * DLM_LKF_NOQUEUEBAST
+ *
+ * Send blocking AST's before returning -EAGAIN to the caller.  It is only
+ * used along with the NOQUEUE flag.  Blocking AST's are not sent for failed
+ * NOQUEUE requests otherwise.
+ *
+ * DLM_LKF_HEADQUE
+ *
+ * Add a lock to the head of the convert or wait queue rather than the tail.
+ *
+ * DLM_LKF_NOORDER
+ *
+ * Disregard the standard grant order rules and grant a lock as soon as it
+ * is compatible with other granted locks.
+ *
+ * DLM_LKF_ORPHAN
+ *
+ * not yet implemented
+ *
+ * DLM_LKF_ALTPR
+ *
+ * If the requested mode cannot be granted immediately, try to grant the lock
+ * in PR mode instead.  If this alternate mode is granted instead of the
+ * requested mode, DLM_SBF_ALTMODE is returned in the lksb.
+ *
+ * DLM_LKF_ALTCW
+ *
+ * The same as ALTPR, but the alternate mode is CW.
+ *
+ * DLM_LKF_FORCEUNLOCK
+ *
+ * Unlock the lock even if it is converting or waiting or has sublocks.
+ * Only really for use by the userland device.c code.
+ *
+ */
+
+#define DLM_LKF_NOQUEUE                0x00000001
+#define DLM_LKF_CANCEL         0x00000002
+#define DLM_LKF_CONVERT                0x00000004
+#define DLM_LKF_VALBLK         0x00000008
+#define DLM_LKF_QUECVT         0x00000010
+#define DLM_LKF_IVVALBLK       0x00000020
+#define DLM_LKF_CONVDEADLK     0x00000040
+#define DLM_LKF_PERSISTENT     0x00000080
+#define DLM_LKF_NODLCKWT       0x00000100
+#define DLM_LKF_NODLCKBLK      0x00000200
+#define DLM_LKF_EXPEDITE       0x00000400
+#define DLM_LKF_NOQUEUEBAST    0x00000800
+#define DLM_LKF_HEADQUE                0x00001000
+#define DLM_LKF_NOORDER                0x00002000
+#define DLM_LKF_ORPHAN         0x00004000
+#define DLM_LKF_ALTPR          0x00008000
+#define DLM_LKF_ALTCW          0x00010000
+#define DLM_LKF_FORCEUNLOCK    0x00020000
+
+/*
+ * Some return codes that are not in errno.h
+ */
+
+#define DLM_ECANCEL            0x10001
+#define DLM_EUNLOCK            0x10002
+
+typedef void dlm_lockspace_t;
+
+/*
+ * Lock range structure
+ */
+
+struct dlm_range {
+       uint64_t ra_start;
+       uint64_t ra_end;
+};
+
+/*
+ * Lock status block
+ *
+ * Use this structure to specify the contents of the lock value block.  For a
+ * conversion request, this structure is used to specify the lock ID of the
+ * lock.  DLM writes the status of the lock request and the lock ID assigned
+ * to the request in the lock status block.
+ *
+ * sb_lkid: the returned lock ID.  It is set on new (non-conversion) requests.
+ * It is available when dlm_lock returns.
+ *
+ * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
+ * shown for the DLM_LKF_VALBLK flag.
+ *
+ * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
+ * it was first demoted to NL to avoid conversion deadlock.
+ * DLM_SBF_VALNOTVALID is returned if the resource's LVB is marked invalid.
+ *
+ * sb_status: the returned status of the lock request set prior to AST
+ * execution.  Possible return values:
+ *
+ * 0 if lock request was successful
+ * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
+ * -ENOMEM if there is no memory to process request
+ * -EINVAL if there are invalid parameters
+ * -DLM_EUNLOCK if unlock request was successful
+ * -DLM_ECANCEL if a cancel completed successfully
+ */
+
+#define DLM_SBF_DEMOTED                0x01
+#define DLM_SBF_VALNOTVALID    0x02
+#define DLM_SBF_ALTMODE                0x04
+
+struct dlm_lksb {
+       int      sb_status;
+       uint32_t sb_lkid;
+       char     sb_flags;
+       char *   sb_lvbptr;
+};
+
+
+#ifdef __KERNEL__
+
+#define DLM_LSFL_NODIR         0x00000001
+
+/*
+ * dlm_new_lockspace
+ *
+ * Starts a lockspace with the given name.  If the named lockspace exists in
+ * the cluster, the calling node joins it.
+ */
+
+int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
+                     uint32_t flags, int lvblen);
+
+/*
+ * dlm_release_lockspace
+ *
+ * Stop a lockspace.
+ */
+
+int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
+
+/*
+ * dlm_lock
+ *
+ * Make an asyncronous request to acquire or convert a lock on a named
+ * resource.
+ *
+ * lockspace: context for the request
+ * mode: the requested mode of the lock (DLM_LOCK_)
+ * lksb: lock status block for input and async return values
+ * flags: input flags (DLM_LKF_)
+ * name: name of the resource to lock, can be binary
+ * namelen: the length in bytes of the resource name (MAX_RESNAME_LEN)
+ * parent: the lock ID of a parent lock or 0 if none
+ * lockast: function DLM executes when it completes processing the request
+ * astarg: argument passed to lockast and bast functions
+ * bast: function DLM executes when this lock later blocks another request
+ *
+ * Returns:
+ * 0 if request is successfully queued for processing
+ * -EINVAL if any input parameters are invalid
+ * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
+ * -ENOMEM if there is no memory to process request
+ * -ENOTCONN if there is a communication error
+ *
+ * If the call to dlm_lock returns an error then the operation has failed and
+ * the AST routine will not be called.  If dlm_lock returns 0 it is still
+ * possible that the lock operation will fail. The AST routine will be called
+ * when the locking is complete and the status is returned in the lksb.
+ *
+ * If the AST routines or parameter are passed to a conversion operation then
+ * they will overwrite those values that were passed to a previous dlm_lock
+ * call.
+ *
+ * AST routines should not block (at least not for long), but may make
+ * any locking calls they please.
+ */
+
+int dlm_lock(dlm_lockspace_t *lockspace,
+            int mode,
+            struct dlm_lksb *lksb,
+            uint32_t flags,
+            void *name,
+            unsigned int namelen,
+            uint32_t parent_lkid,
+            void (*lockast) (void *astarg),
+            void *astarg,
+            void (*bast) (void *astarg, int mode),
+            struct dlm_range *range);
+
+/*
+ * dlm_unlock
+ *
+ * Asynchronously release a lock on a resource.  The AST routine is called
+ * when the resource is successfully unlocked.
+ *
+ * lockspace: context for the request
+ * lkid: the lock ID as returned in the lksb
+ * flags: input flags (DLM_LKF_)
+ * lksb: if NULL the lksb parameter passed to last lock request is used
+ * astarg: the arg used with the completion ast for the unlock
+ *
+ * Returns:
+ * 0 if request is successfully queued for processing
+ * -EINVAL if any input parameters are invalid
+ * -ENOTEMPTY if the lock still has sublocks
+ * -EBUSY if the lock is waiting for a remote lock operation
+ * -ENOTCONN if there is a communication error
+ */
+
+int dlm_unlock(dlm_lockspace_t *lockspace,
+              uint32_t lkid,
+              uint32_t flags,
+              struct dlm_lksb *lksb,
+              void *astarg);
+
+#endif                         /* __KERNEL__ */
+
+#endif                         /* __DLM_DOT_H__ */
+
diff --git a/include/linux/dlm_device.h b/include/linux/dlm_device.h
new file mode 100644 (file)
index 0000000..5e17d29
--- /dev/null
@@ -0,0 +1,84 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/* This is the device interface for dlm, most users will use a library
+ * interface.
+ */
+
+#define DLM_USER_LVB_LEN       32
+
+/* Version of the device interface */
+#define DLM_DEVICE_VERSION_MAJOR 3
+#define DLM_DEVICE_VERSION_MINOR 0
+#define DLM_DEVICE_VERSION_PATCH 0
+
+/* struct passed to the lock write */
+struct dlm_lock_params {
+       __u8 mode;
+       __u16 flags;
+       __u32 lkid;
+       __u32 parent;
+       struct dlm_range range;
+       __u8 namelen;
+        void __user *castparam;
+       void __user *castaddr;
+       void __user *bastparam;
+        void __user *bastaddr;
+       struct dlm_lksb __user *lksb;
+       char lvb[DLM_USER_LVB_LEN];
+       char name[1];
+};
+
+struct dlm_lspace_params {
+       __u32 flags;
+       __u32 minor;
+       char name[1];
+};
+
+struct dlm_write_request {
+       __u32 version[3];
+       __u8 cmd;
+
+       union  {
+               struct dlm_lock_params   lock;
+               struct dlm_lspace_params lspace;
+       } i;
+};
+
+/* struct read from the "device" fd,
+   consists mainly of userspace pointers for the library to use */
+struct dlm_lock_result {
+       __u32 length;
+       void __user * user_astaddr;
+       void __user * user_astparam;
+       struct dlm_lksb __user * user_lksb;
+       struct dlm_lksb lksb;
+       __u8 bast_mode;
+       /* Offsets may be zero if no data is present */
+       __u32 lvb_offset;
+};
+
+/* Commands passed to the device */
+#define DLM_USER_LOCK         1
+#define DLM_USER_UNLOCK       2
+#define DLM_USER_QUERY        3
+#define DLM_USER_CREATE_LOCKSPACE  4
+#define DLM_USER_REMOVE_LOCKSPACE  5
+
+/* Arbitrary length restriction */
+#define MAX_LS_NAME_LEN 64
+
+/* Lockspace flags */
+#define DLM_USER_LSFLG_AUTOFREE   1
+#define DLM_USER_LSFLG_FORCEFREE  2
+