diff -Naur spl-0.6.5.7/cmd/Makefile.am spl-0.6.5.7.new/cmd/Makefile.am
--- spl-0.6.5.7/cmd/Makefile.am	2015-12-24 01:18:07.000000000 +0100
+++ spl-0.6.5.7.new/cmd/Makefile.am	2016-08-01 16:43:23.435766048 +0200
@@ -1,11 +1 @@
-include $(top_srcdir)/config/Rules.am
-
-DEFAULT_INCLUDES += \
-	-I$(top_srcdir)/lib
-
-sbin_PROGRAMS = splat
-
-splat_SOURCES = splat.c
-splat_LDFLAGS = $(top_builddir)/lib/libcommon.la
-
-EXTRA_DIST = splat.h
+SUBDIRS = splat splslab
diff -Naur spl-0.6.5.7/cmd/splat/Makefile.am spl-0.6.5.7.new/cmd/splat/Makefile.am
--- spl-0.6.5.7/cmd/splat/Makefile.am	1970-01-01 01:00:00.000000000 +0100
+++ spl-0.6.5.7.new/cmd/splat/Makefile.am	2016-08-01 16:43:23.435766048 +0200
@@ -0,0 +1,11 @@
+include $(top_srcdir)/config/Rules.am
+
+DEFAULT_INCLUDES += \
+	-I$(top_srcdir)/lib
+
+sbin_PROGRAMS = splat
+
+splat_SOURCES = splat.c
+splat_LDFLAGS = $(top_builddir)/lib/libcommon.la
+
+EXTRA_DIST = splat.h
diff -Naur spl-0.6.5.7/cmd/splat/splat.c spl-0.6.5.7.new/cmd/splat/splat.c
--- spl-0.6.5.7/cmd/splat/splat.c	1970-01-01 01:00:00.000000000 +0100
+++ spl-0.6.5.7.new/cmd/splat/splat.c	2016-08-01 16:43:23.436766051 +0200
@@ -0,0 +1,836 @@
+/*****************************************************************************\
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ *****************************************************************************
+ *  Solaris Porting LAyer Tests (SPLAT) User Space Interface.
+\*****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "splat.h" + +#undef ioctl + +static const char shortOpts[] = "hvlat:xc"; +static const struct option longOpts[] = { + { "help", no_argument, 0, 'h' }, + { "verbose", no_argument, 0, 'v' }, + { "list", no_argument, 0, 'l' }, + { "all", no_argument, 0, 'a' }, + { "test", required_argument, 0, 't' }, + { "exit", no_argument, 0, 'x' }, + { "nocolor", no_argument, 0, 'c' }, + { 0, 0, 0, 0 } +}; + +#define VERSION_SIZE 64 + +static List subsystems; /* Subsystem/tests */ +static int splatctl_fd; /* Control file descriptor */ +static char splat_version[VERSION_SIZE]; /* Kernel version string */ +static char *splat_buffer = NULL; /* Scratch space area */ +static int splat_buffer_size = 0; /* Scratch space size */ + + +static void test_list(List, int); +static int dev_clear(void); +static void subsystem_fini(subsystem_t *); +static void test_fini(test_t *); + + +static int usage(void) { + fprintf(stderr, "usage: splat [hvla] [-t >]\n"); + fprintf(stderr, + " --help -h This help\n" + " --verbose -v Increase verbosity\n" + " --list -l List all tests in all subsystems\n" + " --all -a Run all tests in all subsystems\n" + " --test -t Run 'test' in subsystem 'sub'\n" + " --exit -x Exit on first test error\n" + " --nocolor -c Do not colorize output\n"); + fprintf(stderr, "\n" + "Examples:\n" + " splat -t kmem:all # Runs all kmem tests\n" + " splat -t taskq:0x201 # Run taskq test 0x201\n"); + + return 0; +} + +static subsystem_t *subsystem_init(splat_user_t *desc) +{ + subsystem_t *sub; + + sub = (subsystem_t *)malloc(sizeof(*sub)); + if (sub == NULL) + return NULL; + + memcpy(&sub->sub_desc, desc, sizeof(*desc)); + + sub->sub_tests = list_create((ListDelF)test_fini); + if (sub->sub_tests == NULL) { + free(sub); + return NULL; + } + + return sub; +} + +static void subsystem_fini(subsystem_t *sub) +{ + assert(sub != NULL); + free(sub); +} + +static int subsystem_setup(void) +{ + splat_cfg_t *cfg; + int i, rc, size, cfg_size; + subsystem_t *sub; + splat_user_t *desc; + + /* Aquire the number of registered subsystems */ + cfg_size = sizeof(*cfg); + cfg = (splat_cfg_t *)malloc(cfg_size); + if (cfg == NULL) + return -ENOMEM; + + memset(cfg, 0, cfg_size); + cfg->cfg_magic = SPLAT_CFG_MAGIC; + cfg->cfg_cmd = SPLAT_CFG_SUBSYSTEM_COUNT; + + rc = ioctl(splatctl_fd, SPLAT_CFG, cfg); + if (rc) { + fprintf(stderr, "Ioctl() error 0x%lx / %d: %d\n", + (unsigned long)SPLAT_CFG, cfg->cfg_cmd, errno); + free(cfg); + return rc; + } + + size = cfg->cfg_rc1; + free(cfg); + + /* Based on the newly acquired number of subsystems allocate + * memory to get the descriptive information for them all. 
*/ + cfg_size = sizeof(*cfg) + size * sizeof(splat_user_t); + cfg = (splat_cfg_t *)malloc(cfg_size); + if (cfg == NULL) + return -ENOMEM; + + memset(cfg, 0, cfg_size); + cfg->cfg_magic = SPLAT_CFG_MAGIC; + cfg->cfg_cmd = SPLAT_CFG_SUBSYSTEM_LIST; + cfg->cfg_data.splat_subsystems.size = size; + + rc = ioctl(splatctl_fd, SPLAT_CFG, cfg); + if (rc) { + fprintf(stderr, "Ioctl() error %lu / %d: %d\n", + (unsigned long) SPLAT_CFG, cfg->cfg_cmd, errno); + free(cfg); + return rc; + } + + /* Add the new subsystems in to the global list */ + size = cfg->cfg_rc1; + for (i = 0; i < size; i++) { + desc = &(cfg->cfg_data.splat_subsystems.descs[i]); + + sub = subsystem_init(desc); + if (sub == NULL) { + fprintf(stderr, "Error initializing subsystem: %s\n", + desc->name); + free(cfg); + return -ENOMEM; + } + + list_append(subsystems, sub); + } + + free(cfg); + return 0; +} + +static void subsystem_list(List l, int indent) +{ + ListIterator i; + subsystem_t *sub; + + fprintf(stdout, + "------------------------------ " + "Available SPLAT Tests " + "------------------------------\n"); + + i = list_iterator_create(l); + + while ((sub = list_next(i))) { + fprintf(stdout, "%*s0x%0*x %-*s ---- %s ----\n", + indent, "", + 4, sub->sub_desc.id, + SPLAT_NAME_SIZE + 7, sub->sub_desc.name, + sub->sub_desc.desc); + test_list(sub->sub_tests, indent + 7); + } + + list_iterator_destroy(i); +} + +static test_t *test_init(subsystem_t *sub, splat_user_t *desc) +{ + test_t *test; + + test = (test_t *)malloc(sizeof(*test)); + if (test == NULL) + return NULL; + + test->test_sub = sub; + memcpy(&test->test_desc, desc, sizeof(*desc)); + + return test; +} + +static void test_fini(test_t *test) +{ + assert(test != NULL); + free(test); +} + +static int test_setup(subsystem_t *sub) +{ + splat_cfg_t *cfg; + int i, rc, size; + test_t *test; + splat_user_t *desc; + + /* Aquire the number of registered tests for the give subsystem */ + cfg = (splat_cfg_t *)malloc(sizeof(*cfg)); + if (cfg == NULL) + return -ENOMEM; + + memset(cfg, 0, sizeof(*cfg)); + cfg->cfg_magic = SPLAT_CFG_MAGIC; + cfg->cfg_cmd = SPLAT_CFG_TEST_COUNT; + cfg->cfg_arg1 = sub->sub_desc.id; /* Subsystem of interest */ + + rc = ioctl(splatctl_fd, SPLAT_CFG, cfg); + if (rc) { + fprintf(stderr, "Ioctl() error %lu / %d: %d\n", + (unsigned long) SPLAT_CFG, cfg->cfg_cmd, errno); + free(cfg); + return rc; + } + + size = cfg->cfg_rc1; + free(cfg); + + /* Based on the newly aquired number of tests allocate enough + * memory to get the descriptive information for them all. 
*/ + cfg = (splat_cfg_t *)malloc(sizeof(*cfg) + size*sizeof(splat_user_t)); + if (cfg == NULL) + return -ENOMEM; + + memset(cfg, 0, sizeof(*cfg) + size * sizeof(splat_user_t)); + cfg->cfg_magic = SPLAT_CFG_MAGIC; + cfg->cfg_cmd = SPLAT_CFG_TEST_LIST; + cfg->cfg_arg1 = sub->sub_desc.id; /* Subsystem of interest */ + cfg->cfg_data.splat_tests.size = size; + + rc = ioctl(splatctl_fd, SPLAT_CFG, cfg); + if (rc) { + fprintf(stderr, "Ioctl() error %lu / %d: %d\n", + (unsigned long) SPLAT_CFG, cfg->cfg_cmd, errno); + free(cfg); + return rc; + } + + /* Add the new tests in to the relevant subsystems */ + size = cfg->cfg_rc1; + for (i = 0; i < size; i++) { + desc = &(cfg->cfg_data.splat_tests.descs[i]); + + test = test_init(sub, desc); + if (test == NULL) { + fprintf(stderr, "Error initializing test: %s\n", + desc->name); + free(cfg); + return -ENOMEM; + } + + list_append(sub->sub_tests, test); + } + + free(cfg); + return 0; +} + +static test_t *test_copy(test_t *test) +{ + return test_init(test->test_sub, &test->test_desc); +} + +static void test_list(List l, int indent) +{ + ListIterator i; + test_t *test; + + i = list_iterator_create(l); + + while ((test = list_next(i))) + fprintf(stdout, "%*s0x%0*x %-*s %s\n", + indent, "", 04, test->test_desc.id, + SPLAT_NAME_SIZE, test->test_desc.name, + test->test_desc.desc); + + list_iterator_destroy(i); +} + +static test_t *test_find(char *sub_str, char *test_str) +{ + ListIterator si, ti; + subsystem_t *sub; + test_t *test; + __u32 sub_num, test_num; + + /* + * No error checking here because it may not be a number, it's + * perfectly OK for it to be a string. Since we're just using + * it for comparison purposes this is all very safe. + */ + sub_num = strtoul(sub_str, NULL, 0); + test_num = strtoul(test_str, NULL, 0); + + si = list_iterator_create(subsystems); + + while ((sub = list_next(si))) { + + if (strncmp(sub->sub_desc.name, sub_str, SPLAT_NAME_SIZE) && + sub->sub_desc.id != sub_num) + continue; + + ti = list_iterator_create(sub->sub_tests); + + while ((test = list_next(ti))) { + + if (!strncmp(test->test_desc.name, test_str, + SPLAT_NAME_SIZE) || test->test_desc.id==test_num) { + list_iterator_destroy(ti); + list_iterator_destroy(si); + return test; + } + } + + list_iterator_destroy(ti); + } + + list_iterator_destroy(si); + + return NULL; +} + +static int test_add(cmd_args_t *args, test_t *test) +{ + test_t *tmp; + + tmp = test_copy(test); + if (tmp == NULL) + return -ENOMEM; + + list_append(args->args_tests, tmp); + return 0; +} + +static int test_add_all(cmd_args_t *args) +{ + ListIterator si, ti; + subsystem_t *sub; + test_t *test; + int rc; + + si = list_iterator_create(subsystems); + + while ((sub = list_next(si))) { + ti = list_iterator_create(sub->sub_tests); + + while ((test = list_next(ti))) { + if ((rc = test_add(args, test))) { + list_iterator_destroy(ti); + list_iterator_destroy(si); + return rc; + } + } + + list_iterator_destroy(ti); + } + + list_iterator_destroy(si); + + return 0; +} + +static int test_run(cmd_args_t *args, test_t *test) +{ + subsystem_t *sub = test->test_sub; + splat_cmd_t *cmd; + int rc, cmd_size; + + dev_clear(); + + cmd_size = sizeof(*cmd); + cmd = (splat_cmd_t *)malloc(cmd_size); + if (cmd == NULL) + return -ENOMEM; + + memset(cmd, 0, cmd_size); + cmd->cmd_magic = SPLAT_CMD_MAGIC; + cmd->cmd_subsystem = sub->sub_desc.id; + cmd->cmd_test = test->test_desc.id; + cmd->cmd_data_size = 0; /* Unused feature */ + + fprintf(stdout, "%*s:%-*s ", + SPLAT_NAME_SIZE, sub->sub_desc.name, + SPLAT_NAME_SIZE, 
test->test_desc.name); + fflush(stdout); + rc = ioctl(splatctl_fd, SPLAT_CMD, cmd); + if (args->args_do_color) { + fprintf(stdout, "%s %s\n", rc ? + COLOR_RED "Fail" COLOR_RESET : + COLOR_GREEN "Pass" COLOR_RESET, + rc ? strerror(errno) : ""); + } else { + fprintf(stdout, "%s %s\n", rc ? + "Fail" : "Pass", + rc ? strerror(errno) : ""); + } + fflush(stdout); + free(cmd); + + if ((args->args_verbose == 1 && rc) || + (args->args_verbose >= 2)) { + if ((rc = read(splatctl_fd, splat_buffer, + splat_buffer_size - 1)) < 0) { + fprintf(stdout, "Error reading results: %d\n", rc); + } else { + fprintf(stdout, "\n%s\n", splat_buffer); + fflush(stdout); + } + } + + return rc; +} + +static int tests_run(cmd_args_t *args) +{ + ListIterator i; + test_t *test; + int rc; + + fprintf(stdout, + "------------------------------ " + "Running SPLAT Tests " + "------------------------------\n"); + + i = list_iterator_create(args->args_tests); + + while ((test = list_next(i))) { + rc = test_run(args, test); + if (rc && args->args_exit_on_error) { + list_iterator_destroy(i); + return rc; + } + } + + list_iterator_destroy(i); + return 0; +} + +static int args_parse_test(cmd_args_t *args, char *str) +{ + ListIterator si, ti; + subsystem_t *s; + test_t *t; + char *sub_str, *test_str; + int sub_num, test_num; + int sub_all = 0, test_all = 0; + int rc, flag = 0; + + test_str = strchr(str, ':'); + if (test_str == NULL) { + fprintf(stderr, "Test must be of the " + "form \n"); + return -EINVAL; + } + + sub_str = str; + test_str[0] = '\0'; + test_str = test_str + 1; + + sub_num = strtol(sub_str, NULL, 0); + test_num = strtol(test_str, NULL, 0); + + if (!strncasecmp(sub_str, "all", strlen(sub_str)) || (sub_num == -1)) + sub_all = 1; + + if (!strncasecmp(test_str,"all",strlen(test_str)) || (test_num == -1)) + test_all = 1; + + si = list_iterator_create(subsystems); + + if (sub_all) { + if (test_all) { + /* Add all tests from all subsystems */ + while ((s = list_next(si))) { + ti = list_iterator_create(s->sub_tests); + while ((t = list_next(ti))) { + if ((rc = test_add(args, t))) { + list_iterator_destroy(ti); + goto error_run; + } + } + list_iterator_destroy(ti); + } + } else { + /* Add a specific test from all subsystems */ + while ((s = list_next(si))) { + if ((t=test_find(s->sub_desc.name,test_str))) { + if ((rc = test_add(args, t))) + goto error_run; + + flag = 1; + } + } + + if (!flag) + fprintf(stderr, "No tests '%s:%s' could be " + "found\n", sub_str, test_str); + } + } else { + if (test_all) { + /* Add all tests from a specific subsystem */ + while ((s = list_next(si))) { + if (strncasecmp(sub_str, s->sub_desc.name, + strlen(sub_str))) + continue; + + ti = list_iterator_create(s->sub_tests); + while ((t = list_next(ti))) { + if ((rc = test_add(args, t))) { + list_iterator_destroy(ti); + goto error_run; + } + } + list_iterator_destroy(ti); + } + } else { + /* Add a specific test from a specific subsystem */ + if ((t = test_find(sub_str, test_str))) { + if ((rc = test_add(args, t))) + goto error_run; + } else { + fprintf(stderr, "Test '%s:%s' could not be " + "found\n", sub_str, test_str); + return -EINVAL; + } + } + } + + list_iterator_destroy(si); + + return 0; + +error_run: + list_iterator_destroy(si); + + fprintf(stderr, "Test '%s:%s' not added to run list: %d\n", + sub_str, test_str, rc); + + return rc; +} + +static void args_fini(cmd_args_t *args) +{ + assert(args != NULL); + + if (args->args_tests != NULL) + list_destroy(args->args_tests); + + free(args); +} + +static cmd_args_t * +args_init(int argc, char 
**argv) +{ + cmd_args_t *args; + int c, rc; + + if (argc == 1) { + usage(); + return (cmd_args_t *) NULL; + } + + /* Configure and populate the args structures */ + args = malloc(sizeof(*args)); + if (args == NULL) + return NULL; + + memset(args, 0, sizeof(*args)); + args->args_verbose = 0; + args->args_do_list = 0; + args->args_do_all = 0; + args->args_do_color = 1; + args->args_exit_on_error = 0; + args->args_tests = list_create((ListDelF)test_fini); + if (args->args_tests == NULL) { + args_fini(args); + return NULL; + } + + while ((c = getopt_long(argc, argv, shortOpts, longOpts, NULL)) != -1){ + switch (c) { + case 'v': args->args_verbose++; break; + case 'l': args->args_do_list = 1; break; + case 'a': args->args_do_all = 1; break; + case 'c': args->args_do_color = 0; break; + case 'x': args->args_exit_on_error = 1; break; + case 't': + if (args->args_do_all) { + fprintf(stderr, "Option -t is " + "useless when used with -a\n"); + args_fini(args); + return NULL; + } + + rc = args_parse_test(args, argv[optind - 1]); + if (rc) { + args_fini(args); + return NULL; + } + break; + case 'h': + case '?': + usage(); + args_fini(args); + return NULL; + default: + fprintf(stderr, "Unknown option '%s'\n", + argv[optind - 1]); + break; + } + } + + return args; +} + +static int +dev_clear(void) +{ + splat_cfg_t cfg; + int rc; + + memset(&cfg, 0, sizeof(cfg)); + cfg.cfg_magic = SPLAT_CFG_MAGIC; + cfg.cfg_cmd = SPLAT_CFG_BUFFER_CLEAR; + cfg.cfg_arg1 = 0; + + rc = ioctl(splatctl_fd, SPLAT_CFG, &cfg); + if (rc) + fprintf(stderr, "Ioctl() error %lu / %d: %d\n", + (unsigned long) SPLAT_CFG, cfg.cfg_cmd, errno); + + lseek(splatctl_fd, 0, SEEK_SET); + + return rc; +} + +static int +dev_size(int size) +{ + splat_cfg_t cfg; + int rc; + + memset(&cfg, 0, sizeof(cfg)); + cfg.cfg_magic = SPLAT_CFG_MAGIC; + cfg.cfg_cmd = SPLAT_CFG_BUFFER_SIZE; + cfg.cfg_arg1 = size; + + rc = ioctl(splatctl_fd, SPLAT_CFG, &cfg); + if (rc) { + fprintf(stderr, "Ioctl() error %lu / %d: %d\n", + (unsigned long) SPLAT_CFG, cfg.cfg_cmd, errno); + return rc; + } + + return cfg.cfg_rc1; +} + +static void +dev_fini(void) +{ + if (splat_buffer) + free(splat_buffer); + + if (splatctl_fd != -1) { + if (close(splatctl_fd) == -1) { + fprintf(stderr, "Unable to close %s: %d\n", + SPLAT_DEV, errno); + } + } +} + +static int +dev_init(void) +{ + ListIterator i; + subsystem_t *sub; + int rc; + + splatctl_fd = open(SPLAT_DEV, O_RDONLY); + if (splatctl_fd == -1) { + fprintf(stderr, "Unable to open %s: %d\n" + "Is the splat module loaded?\n", SPLAT_DEV, errno); + rc = errno; + goto error; + } + + /* Determine kernel module version string */ + memset(splat_version, 0, VERSION_SIZE); + if ((rc = read(splatctl_fd, splat_version, VERSION_SIZE - 1)) == -1) + goto error; + + if ((rc = dev_clear())) + goto error; + + if ((rc = dev_size(0)) < 0) + goto error; + + splat_buffer_size = rc; + splat_buffer = (char *)malloc(splat_buffer_size); + if (splat_buffer == NULL) { + rc = -ENOMEM; + goto error; + } + + memset(splat_buffer, 0, splat_buffer_size); + + /* Determine available subsystems */ + if ((rc = subsystem_setup()) != 0) + goto error; + + /* Determine available tests for all subsystems */ + i = list_iterator_create(subsystems); + + while ((sub = list_next(i))) { + if ((rc = test_setup(sub)) != 0) { + list_iterator_destroy(i); + goto error; + } + } + + list_iterator_destroy(i); + return 0; + +error: + if (splatctl_fd != -1) { + if (close(splatctl_fd) == -1) { + fprintf(stderr, "Unable to close %s: %d\n", + SPLAT_DEV, errno); + } + } + + return rc; +} + +int 
+init(void) +{ + int rc = 0; + + /* Allocate the subsystem list */ + subsystems = list_create((ListDelF)subsystem_fini); + if (subsystems == NULL) + rc = ENOMEM; + + return rc; +} + +void +fini(void) +{ + list_destroy(subsystems); +} + + +int +main(int argc, char **argv) +{ + cmd_args_t *args = NULL; + int rc = 0; + + /* General init */ + if ((rc = init())) + return rc; + + /* Device specific init */ + if ((rc = dev_init())) + goto out; + + /* Argument init and parsing */ + if ((args = args_init(argc, argv)) == NULL) { + rc = -1; + goto out; + } + + /* Generic kernel version string */ + if (args->args_verbose) + fprintf(stdout, "%s", splat_version); + + /* Print the available test list and exit */ + if (args->args_do_list) { + subsystem_list(subsystems, 0); + goto out; + } + + /* Add all available test to the list of tests to run */ + if (args->args_do_all) { + if ((rc = test_add_all(args))) + goto out; + } + + /* Run all the requested tests */ + if ((rc = tests_run(args))) + goto out; + +out: + if (args != NULL) + args_fini(args); + + dev_fini(); + fini(); + return rc; +} diff -Naur spl-0.6.5.7/cmd/splat/splat.h spl-0.6.5.7.new/cmd/splat/splat.h --- spl-0.6.5.7/cmd/splat/splat.h 1970-01-01 01:00:00.000000000 +0100 +++ spl-0.6.5.7.new/cmd/splat/splat.h 2016-08-01 16:43:23.436766051 +0200 @@ -0,0 +1,70 @@ +/*****************************************************************************\ + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
+\*****************************************************************************/ + +#ifndef _SPLAT_H +#define _SPLAT_H + +#include "list.h" +#include "../include/splat-ctl.h" + +#define DEV_NAME "/dev/splatctl" +#define COLOR_BLACK "\033[0;30m" +#define COLOR_DK_GRAY "\033[1;30m" +#define COLOR_BLUE "\033[0;34m" +#define COLOR_LT_BLUE "\033[1;34m" +#define COLOR_GREEN "\033[0;32m" +#define COLOR_LT_GREEN "\033[1;32m" +#define COLOR_CYAN "\033[0;36m" +#define COLOR_LT_CYAN "\033[1;36m" +#define COLOR_RED "\033[0;31m" +#define COLOR_LT_RED "\033[1;31m" +#define COLOR_PURPLE "\033[0;35m" +#define COLOR_LT_PURPLE "\033[1;35m" +#define COLOR_BROWN "\033[0;33m" +#define COLOR_YELLOW "\033[1;33m" +#define COLOR_LT_GRAY "\033[0;37m" +#define COLOR_WHITE "\033[1;37m" +#define COLOR_RESET "\033[0m" + +typedef struct subsystem { + splat_user_t sub_desc; /* Subsystem description */ + List sub_tests; /* Assocated subsystem tests list */ +} subsystem_t; + +typedef struct test { + splat_user_t test_desc; /* Test description */ + subsystem_t *test_sub; /* Parent subsystem */ +} test_t; + +typedef struct cmd_args { + int args_verbose; /* Verbose flag */ + int args_do_list; /* Display all tests flag */ + int args_do_all; /* Run all tests flag */ + int args_do_color; /* Colorize output */ + int args_exit_on_error; /* Exit on first error flag */ + List args_tests; /* Requested subsystems/tests */ +} cmd_args_t; + +#endif /* _SPLAT_H */ + diff -Naur spl-0.6.5.7/cmd/splat.c spl-0.6.5.7.new/cmd/splat.c --- spl-0.6.5.7/cmd/splat.c 2015-12-24 01:18:07.000000000 +0100 +++ spl-0.6.5.7.new/cmd/splat.c 1970-01-01 01:00:00.000000000 +0100 @@ -1,836 +0,0 @@ -/*****************************************************************************\ - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see . - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . - ***************************************************************************** - * Solaris Porting LAyer Tests (SPLAT) User Space Interface. 
-\*****************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "splat.h" - -#undef ioctl - -static const char shortOpts[] = "hvlat:xc"; -static const struct option longOpts[] = { - { "help", no_argument, 0, 'h' }, - { "verbose", no_argument, 0, 'v' }, - { "list", no_argument, 0, 'l' }, - { "all", no_argument, 0, 'a' }, - { "test", required_argument, 0, 't' }, - { "exit", no_argument, 0, 'x' }, - { "nocolor", no_argument, 0, 'c' }, - { 0, 0, 0, 0 } -}; - -#define VERSION_SIZE 64 - -static List subsystems; /* Subsystem/tests */ -static int splatctl_fd; /* Control file descriptor */ -static char splat_version[VERSION_SIZE]; /* Kernel version string */ -static char *splat_buffer = NULL; /* Scratch space area */ -static int splat_buffer_size = 0; /* Scratch space size */ - - -static void test_list(List, int); -static int dev_clear(void); -static void subsystem_fini(subsystem_t *); -static void test_fini(test_t *); - - -static int usage(void) { - fprintf(stderr, "usage: splat [hvla] [-t >]\n"); - fprintf(stderr, - " --help -h This help\n" - " --verbose -v Increase verbosity\n" - " --list -l List all tests in all subsystems\n" - " --all -a Run all tests in all subsystems\n" - " --test -t Run 'test' in subsystem 'sub'\n" - " --exit -x Exit on first test error\n" - " --nocolor -c Do not colorize output\n"); - fprintf(stderr, "\n" - "Examples:\n" - " splat -t kmem:all # Runs all kmem tests\n" - " splat -t taskq:0x201 # Run taskq test 0x201\n"); - - return 0; -} - -static subsystem_t *subsystem_init(splat_user_t *desc) -{ - subsystem_t *sub; - - sub = (subsystem_t *)malloc(sizeof(*sub)); - if (sub == NULL) - return NULL; - - memcpy(&sub->sub_desc, desc, sizeof(*desc)); - - sub->sub_tests = list_create((ListDelF)test_fini); - if (sub->sub_tests == NULL) { - free(sub); - return NULL; - } - - return sub; -} - -static void subsystem_fini(subsystem_t *sub) -{ - assert(sub != NULL); - free(sub); -} - -static int subsystem_setup(void) -{ - splat_cfg_t *cfg; - int i, rc, size, cfg_size; - subsystem_t *sub; - splat_user_t *desc; - - /* Aquire the number of registered subsystems */ - cfg_size = sizeof(*cfg); - cfg = (splat_cfg_t *)malloc(cfg_size); - if (cfg == NULL) - return -ENOMEM; - - memset(cfg, 0, cfg_size); - cfg->cfg_magic = SPLAT_CFG_MAGIC; - cfg->cfg_cmd = SPLAT_CFG_SUBSYSTEM_COUNT; - - rc = ioctl(splatctl_fd, SPLAT_CFG, cfg); - if (rc) { - fprintf(stderr, "Ioctl() error 0x%lx / %d: %d\n", - (unsigned long)SPLAT_CFG, cfg->cfg_cmd, errno); - free(cfg); - return rc; - } - - size = cfg->cfg_rc1; - free(cfg); - - /* Based on the newly acquired number of subsystems allocate - * memory to get the descriptive information for them all. 
*/ - cfg_size = sizeof(*cfg) + size * sizeof(splat_user_t); - cfg = (splat_cfg_t *)malloc(cfg_size); - if (cfg == NULL) - return -ENOMEM; - - memset(cfg, 0, cfg_size); - cfg->cfg_magic = SPLAT_CFG_MAGIC; - cfg->cfg_cmd = SPLAT_CFG_SUBSYSTEM_LIST; - cfg->cfg_data.splat_subsystems.size = size; - - rc = ioctl(splatctl_fd, SPLAT_CFG, cfg); - if (rc) { - fprintf(stderr, "Ioctl() error %lu / %d: %d\n", - (unsigned long) SPLAT_CFG, cfg->cfg_cmd, errno); - free(cfg); - return rc; - } - - /* Add the new subsystems in to the global list */ - size = cfg->cfg_rc1; - for (i = 0; i < size; i++) { - desc = &(cfg->cfg_data.splat_subsystems.descs[i]); - - sub = subsystem_init(desc); - if (sub == NULL) { - fprintf(stderr, "Error initializing subsystem: %s\n", - desc->name); - free(cfg); - return -ENOMEM; - } - - list_append(subsystems, sub); - } - - free(cfg); - return 0; -} - -static void subsystem_list(List l, int indent) -{ - ListIterator i; - subsystem_t *sub; - - fprintf(stdout, - "------------------------------ " - "Available SPLAT Tests " - "------------------------------\n"); - - i = list_iterator_create(l); - - while ((sub = list_next(i))) { - fprintf(stdout, "%*s0x%0*x %-*s ---- %s ----\n", - indent, "", - 4, sub->sub_desc.id, - SPLAT_NAME_SIZE + 7, sub->sub_desc.name, - sub->sub_desc.desc); - test_list(sub->sub_tests, indent + 7); - } - - list_iterator_destroy(i); -} - -static test_t *test_init(subsystem_t *sub, splat_user_t *desc) -{ - test_t *test; - - test = (test_t *)malloc(sizeof(*test)); - if (test == NULL) - return NULL; - - test->test_sub = sub; - memcpy(&test->test_desc, desc, sizeof(*desc)); - - return test; -} - -static void test_fini(test_t *test) -{ - assert(test != NULL); - free(test); -} - -static int test_setup(subsystem_t *sub) -{ - splat_cfg_t *cfg; - int i, rc, size; - test_t *test; - splat_user_t *desc; - - /* Aquire the number of registered tests for the give subsystem */ - cfg = (splat_cfg_t *)malloc(sizeof(*cfg)); - if (cfg == NULL) - return -ENOMEM; - - memset(cfg, 0, sizeof(*cfg)); - cfg->cfg_magic = SPLAT_CFG_MAGIC; - cfg->cfg_cmd = SPLAT_CFG_TEST_COUNT; - cfg->cfg_arg1 = sub->sub_desc.id; /* Subsystem of interest */ - - rc = ioctl(splatctl_fd, SPLAT_CFG, cfg); - if (rc) { - fprintf(stderr, "Ioctl() error %lu / %d: %d\n", - (unsigned long) SPLAT_CFG, cfg->cfg_cmd, errno); - free(cfg); - return rc; - } - - size = cfg->cfg_rc1; - free(cfg); - - /* Based on the newly aquired number of tests allocate enough - * memory to get the descriptive information for them all. 
*/ - cfg = (splat_cfg_t *)malloc(sizeof(*cfg) + size*sizeof(splat_user_t)); - if (cfg == NULL) - return -ENOMEM; - - memset(cfg, 0, sizeof(*cfg) + size * sizeof(splat_user_t)); - cfg->cfg_magic = SPLAT_CFG_MAGIC; - cfg->cfg_cmd = SPLAT_CFG_TEST_LIST; - cfg->cfg_arg1 = sub->sub_desc.id; /* Subsystem of interest */ - cfg->cfg_data.splat_tests.size = size; - - rc = ioctl(splatctl_fd, SPLAT_CFG, cfg); - if (rc) { - fprintf(stderr, "Ioctl() error %lu / %d: %d\n", - (unsigned long) SPLAT_CFG, cfg->cfg_cmd, errno); - free(cfg); - return rc; - } - - /* Add the new tests in to the relevant subsystems */ - size = cfg->cfg_rc1; - for (i = 0; i < size; i++) { - desc = &(cfg->cfg_data.splat_tests.descs[i]); - - test = test_init(sub, desc); - if (test == NULL) { - fprintf(stderr, "Error initializing test: %s\n", - desc->name); - free(cfg); - return -ENOMEM; - } - - list_append(sub->sub_tests, test); - } - - free(cfg); - return 0; -} - -static test_t *test_copy(test_t *test) -{ - return test_init(test->test_sub, &test->test_desc); -} - -static void test_list(List l, int indent) -{ - ListIterator i; - test_t *test; - - i = list_iterator_create(l); - - while ((test = list_next(i))) - fprintf(stdout, "%*s0x%0*x %-*s %s\n", - indent, "", 04, test->test_desc.id, - SPLAT_NAME_SIZE, test->test_desc.name, - test->test_desc.desc); - - list_iterator_destroy(i); -} - -static test_t *test_find(char *sub_str, char *test_str) -{ - ListIterator si, ti; - subsystem_t *sub; - test_t *test; - __u32 sub_num, test_num; - - /* - * No error checking here because it may not be a number, it's - * perfectly OK for it to be a string. Since we're just using - * it for comparison purposes this is all very safe. - */ - sub_num = strtoul(sub_str, NULL, 0); - test_num = strtoul(test_str, NULL, 0); - - si = list_iterator_create(subsystems); - - while ((sub = list_next(si))) { - - if (strncmp(sub->sub_desc.name, sub_str, SPLAT_NAME_SIZE) && - sub->sub_desc.id != sub_num) - continue; - - ti = list_iterator_create(sub->sub_tests); - - while ((test = list_next(ti))) { - - if (!strncmp(test->test_desc.name, test_str, - SPLAT_NAME_SIZE) || test->test_desc.id==test_num) { - list_iterator_destroy(ti); - list_iterator_destroy(si); - return test; - } - } - - list_iterator_destroy(ti); - } - - list_iterator_destroy(si); - - return NULL; -} - -static int test_add(cmd_args_t *args, test_t *test) -{ - test_t *tmp; - - tmp = test_copy(test); - if (tmp == NULL) - return -ENOMEM; - - list_append(args->args_tests, tmp); - return 0; -} - -static int test_add_all(cmd_args_t *args) -{ - ListIterator si, ti; - subsystem_t *sub; - test_t *test; - int rc; - - si = list_iterator_create(subsystems); - - while ((sub = list_next(si))) { - ti = list_iterator_create(sub->sub_tests); - - while ((test = list_next(ti))) { - if ((rc = test_add(args, test))) { - list_iterator_destroy(ti); - list_iterator_destroy(si); - return rc; - } - } - - list_iterator_destroy(ti); - } - - list_iterator_destroy(si); - - return 0; -} - -static int test_run(cmd_args_t *args, test_t *test) -{ - subsystem_t *sub = test->test_sub; - splat_cmd_t *cmd; - int rc, cmd_size; - - dev_clear(); - - cmd_size = sizeof(*cmd); - cmd = (splat_cmd_t *)malloc(cmd_size); - if (cmd == NULL) - return -ENOMEM; - - memset(cmd, 0, cmd_size); - cmd->cmd_magic = SPLAT_CMD_MAGIC; - cmd->cmd_subsystem = sub->sub_desc.id; - cmd->cmd_test = test->test_desc.id; - cmd->cmd_data_size = 0; /* Unused feature */ - - fprintf(stdout, "%*s:%-*s ", - SPLAT_NAME_SIZE, sub->sub_desc.name, - SPLAT_NAME_SIZE, 
test->test_desc.name); - fflush(stdout); - rc = ioctl(splatctl_fd, SPLAT_CMD, cmd); - if (args->args_do_color) { - fprintf(stdout, "%s %s\n", rc ? - COLOR_RED "Fail" COLOR_RESET : - COLOR_GREEN "Pass" COLOR_RESET, - rc ? strerror(errno) : ""); - } else { - fprintf(stdout, "%s %s\n", rc ? - "Fail" : "Pass", - rc ? strerror(errno) : ""); - } - fflush(stdout); - free(cmd); - - if ((args->args_verbose == 1 && rc) || - (args->args_verbose >= 2)) { - if ((rc = read(splatctl_fd, splat_buffer, - splat_buffer_size - 1)) < 0) { - fprintf(stdout, "Error reading results: %d\n", rc); - } else { - fprintf(stdout, "\n%s\n", splat_buffer); - fflush(stdout); - } - } - - return rc; -} - -static int tests_run(cmd_args_t *args) -{ - ListIterator i; - test_t *test; - int rc; - - fprintf(stdout, - "------------------------------ " - "Running SPLAT Tests " - "------------------------------\n"); - - i = list_iterator_create(args->args_tests); - - while ((test = list_next(i))) { - rc = test_run(args, test); - if (rc && args->args_exit_on_error) { - list_iterator_destroy(i); - return rc; - } - } - - list_iterator_destroy(i); - return 0; -} - -static int args_parse_test(cmd_args_t *args, char *str) -{ - ListIterator si, ti; - subsystem_t *s; - test_t *t; - char *sub_str, *test_str; - int sub_num, test_num; - int sub_all = 0, test_all = 0; - int rc, flag = 0; - - test_str = strchr(str, ':'); - if (test_str == NULL) { - fprintf(stderr, "Test must be of the " - "form \n"); - return -EINVAL; - } - - sub_str = str; - test_str[0] = '\0'; - test_str = test_str + 1; - - sub_num = strtol(sub_str, NULL, 0); - test_num = strtol(test_str, NULL, 0); - - if (!strncasecmp(sub_str, "all", strlen(sub_str)) || (sub_num == -1)) - sub_all = 1; - - if (!strncasecmp(test_str,"all",strlen(test_str)) || (test_num == -1)) - test_all = 1; - - si = list_iterator_create(subsystems); - - if (sub_all) { - if (test_all) { - /* Add all tests from all subsystems */ - while ((s = list_next(si))) { - ti = list_iterator_create(s->sub_tests); - while ((t = list_next(ti))) { - if ((rc = test_add(args, t))) { - list_iterator_destroy(ti); - goto error_run; - } - } - list_iterator_destroy(ti); - } - } else { - /* Add a specific test from all subsystems */ - while ((s = list_next(si))) { - if ((t=test_find(s->sub_desc.name,test_str))) { - if ((rc = test_add(args, t))) - goto error_run; - - flag = 1; - } - } - - if (!flag) - fprintf(stderr, "No tests '%s:%s' could be " - "found\n", sub_str, test_str); - } - } else { - if (test_all) { - /* Add all tests from a specific subsystem */ - while ((s = list_next(si))) { - if (strncasecmp(sub_str, s->sub_desc.name, - strlen(sub_str))) - continue; - - ti = list_iterator_create(s->sub_tests); - while ((t = list_next(ti))) { - if ((rc = test_add(args, t))) { - list_iterator_destroy(ti); - goto error_run; - } - } - list_iterator_destroy(ti); - } - } else { - /* Add a specific test from a specific subsystem */ - if ((t = test_find(sub_str, test_str))) { - if ((rc = test_add(args, t))) - goto error_run; - } else { - fprintf(stderr, "Test '%s:%s' could not be " - "found\n", sub_str, test_str); - return -EINVAL; - } - } - } - - list_iterator_destroy(si); - - return 0; - -error_run: - list_iterator_destroy(si); - - fprintf(stderr, "Test '%s:%s' not added to run list: %d\n", - sub_str, test_str, rc); - - return rc; -} - -static void args_fini(cmd_args_t *args) -{ - assert(args != NULL); - - if (args->args_tests != NULL) - list_destroy(args->args_tests); - - free(args); -} - -static cmd_args_t * -args_init(int argc, char 
**argv) -{ - cmd_args_t *args; - int c, rc; - - if (argc == 1) { - usage(); - return (cmd_args_t *) NULL; - } - - /* Configure and populate the args structures */ - args = malloc(sizeof(*args)); - if (args == NULL) - return NULL; - - memset(args, 0, sizeof(*args)); - args->args_verbose = 0; - args->args_do_list = 0; - args->args_do_all = 0; - args->args_do_color = 1; - args->args_exit_on_error = 0; - args->args_tests = list_create((ListDelF)test_fini); - if (args->args_tests == NULL) { - args_fini(args); - return NULL; - } - - while ((c = getopt_long(argc, argv, shortOpts, longOpts, NULL)) != -1){ - switch (c) { - case 'v': args->args_verbose++; break; - case 'l': args->args_do_list = 1; break; - case 'a': args->args_do_all = 1; break; - case 'c': args->args_do_color = 0; break; - case 'x': args->args_exit_on_error = 1; break; - case 't': - if (args->args_do_all) { - fprintf(stderr, "Option -t is " - "useless when used with -a\n"); - args_fini(args); - return NULL; - } - - rc = args_parse_test(args, argv[optind - 1]); - if (rc) { - args_fini(args); - return NULL; - } - break; - case 'h': - case '?': - usage(); - args_fini(args); - return NULL; - default: - fprintf(stderr, "Unknown option '%s'\n", - argv[optind - 1]); - break; - } - } - - return args; -} - -static int -dev_clear(void) -{ - splat_cfg_t cfg; - int rc; - - memset(&cfg, 0, sizeof(cfg)); - cfg.cfg_magic = SPLAT_CFG_MAGIC; - cfg.cfg_cmd = SPLAT_CFG_BUFFER_CLEAR; - cfg.cfg_arg1 = 0; - - rc = ioctl(splatctl_fd, SPLAT_CFG, &cfg); - if (rc) - fprintf(stderr, "Ioctl() error %lu / %d: %d\n", - (unsigned long) SPLAT_CFG, cfg.cfg_cmd, errno); - - lseek(splatctl_fd, 0, SEEK_SET); - - return rc; -} - -static int -dev_size(int size) -{ - splat_cfg_t cfg; - int rc; - - memset(&cfg, 0, sizeof(cfg)); - cfg.cfg_magic = SPLAT_CFG_MAGIC; - cfg.cfg_cmd = SPLAT_CFG_BUFFER_SIZE; - cfg.cfg_arg1 = size; - - rc = ioctl(splatctl_fd, SPLAT_CFG, &cfg); - if (rc) { - fprintf(stderr, "Ioctl() error %lu / %d: %d\n", - (unsigned long) SPLAT_CFG, cfg.cfg_cmd, errno); - return rc; - } - - return cfg.cfg_rc1; -} - -static void -dev_fini(void) -{ - if (splat_buffer) - free(splat_buffer); - - if (splatctl_fd != -1) { - if (close(splatctl_fd) == -1) { - fprintf(stderr, "Unable to close %s: %d\n", - SPLAT_DEV, errno); - } - } -} - -static int -dev_init(void) -{ - ListIterator i; - subsystem_t *sub; - int rc; - - splatctl_fd = open(SPLAT_DEV, O_RDONLY); - if (splatctl_fd == -1) { - fprintf(stderr, "Unable to open %s: %d\n" - "Is the splat module loaded?\n", SPLAT_DEV, errno); - rc = errno; - goto error; - } - - /* Determine kernel module version string */ - memset(splat_version, 0, VERSION_SIZE); - if ((rc = read(splatctl_fd, splat_version, VERSION_SIZE - 1)) == -1) - goto error; - - if ((rc = dev_clear())) - goto error; - - if ((rc = dev_size(0)) < 0) - goto error; - - splat_buffer_size = rc; - splat_buffer = (char *)malloc(splat_buffer_size); - if (splat_buffer == NULL) { - rc = -ENOMEM; - goto error; - } - - memset(splat_buffer, 0, splat_buffer_size); - - /* Determine available subsystems */ - if ((rc = subsystem_setup()) != 0) - goto error; - - /* Determine available tests for all subsystems */ - i = list_iterator_create(subsystems); - - while ((sub = list_next(i))) { - if ((rc = test_setup(sub)) != 0) { - list_iterator_destroy(i); - goto error; - } - } - - list_iterator_destroy(i); - return 0; - -error: - if (splatctl_fd != -1) { - if (close(splatctl_fd) == -1) { - fprintf(stderr, "Unable to close %s: %d\n", - SPLAT_DEV, errno); - } - } - - return rc; -} - -int 
-init(void) -{ - int rc = 0; - - /* Allocate the subsystem list */ - subsystems = list_create((ListDelF)subsystem_fini); - if (subsystems == NULL) - rc = ENOMEM; - - return rc; -} - -void -fini(void) -{ - list_destroy(subsystems); -} - - -int -main(int argc, char **argv) -{ - cmd_args_t *args = NULL; - int rc = 0; - - /* General init */ - if ((rc = init())) - return rc; - - /* Device specific init */ - if ((rc = dev_init())) - goto out; - - /* Argument init and parsing */ - if ((args = args_init(argc, argv)) == NULL) { - rc = -1; - goto out; - } - - /* Generic kernel version string */ - if (args->args_verbose) - fprintf(stdout, "%s", splat_version); - - /* Print the available test list and exit */ - if (args->args_do_list) { - subsystem_list(subsystems, 0); - goto out; - } - - /* Add all available test to the list of tests to run */ - if (args->args_do_all) { - if ((rc = test_add_all(args))) - goto out; - } - - /* Run all the requested tests */ - if ((rc = tests_run(args))) - goto out; - -out: - if (args != NULL) - args_fini(args); - - dev_fini(); - fini(); - return rc; -} diff -Naur spl-0.6.5.7/cmd/splat.h spl-0.6.5.7.new/cmd/splat.h --- spl-0.6.5.7/cmd/splat.h 2015-12-24 01:18:07.000000000 +0100 +++ spl-0.6.5.7.new/cmd/splat.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,70 +0,0 @@ -/*****************************************************************************\ - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see . - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . 
-\*****************************************************************************/ - -#ifndef _SPLAT_H -#define _SPLAT_H - -#include "list.h" -#include "../include/splat-ctl.h" - -#define DEV_NAME "/dev/splatctl" -#define COLOR_BLACK "\033[0;30m" -#define COLOR_DK_GRAY "\033[1;30m" -#define COLOR_BLUE "\033[0;34m" -#define COLOR_LT_BLUE "\033[1;34m" -#define COLOR_GREEN "\033[0;32m" -#define COLOR_LT_GREEN "\033[1;32m" -#define COLOR_CYAN "\033[0;36m" -#define COLOR_LT_CYAN "\033[1;36m" -#define COLOR_RED "\033[0;31m" -#define COLOR_LT_RED "\033[1;31m" -#define COLOR_PURPLE "\033[0;35m" -#define COLOR_LT_PURPLE "\033[1;35m" -#define COLOR_BROWN "\033[0;33m" -#define COLOR_YELLOW "\033[1;33m" -#define COLOR_LT_GRAY "\033[0;37m" -#define COLOR_WHITE "\033[1;37m" -#define COLOR_RESET "\033[0m" - -typedef struct subsystem { - splat_user_t sub_desc; /* Subsystem description */ - List sub_tests; /* Assocated subsystem tests list */ -} subsystem_t; - -typedef struct test { - splat_user_t test_desc; /* Test description */ - subsystem_t *test_sub; /* Parent subsystem */ -} test_t; - -typedef struct cmd_args { - int args_verbose; /* Verbose flag */ - int args_do_list; /* Display all tests flag */ - int args_do_all; /* Run all tests flag */ - int args_do_color; /* Colorize output */ - int args_exit_on_error; /* Exit on first error flag */ - List args_tests; /* Requested subsystems/tests */ -} cmd_args_t; - -#endif /* _SPLAT_H */ - diff -Naur spl-0.6.5.7/cmd/splslab/Makefile.am spl-0.6.5.7.new/cmd/splslab/Makefile.am --- spl-0.6.5.7/cmd/splslab/Makefile.am 1970-01-01 01:00:00.000000000 +0100 +++ spl-0.6.5.7.new/cmd/splslab/Makefile.am 2016-08-01 16:43:23.436766051 +0200 @@ -0,0 +1,2 @@ +bin_SCRIPTS = splslab.py +EXTRA_DIST = $(bin_SCRIPTS) diff -Naur spl-0.6.5.7/cmd/splslab/splslab.py spl-0.6.5.7.new/cmd/splslab/splslab.py --- spl-0.6.5.7/cmd/splslab/splslab.py 1970-01-01 01:00:00.000000000 +0100 +++ spl-0.6.5.7.new/cmd/splslab/splslab.py 2016-08-01 16:43:23.436766051 +0200 @@ -0,0 +1,202 @@ +#!/usr/bin/python + +import sys +import time +import getopt +import re +import signal +from collections import defaultdict + +class Stat: + # flag definitions based on the kmem.h + NOTOUCH = 1 + NODEBUG = 2 + KMEM = 32 + VMEM = 64 + SLAB = 128 + OFFSLAB = 256 + NOEMERGENCY = 512 + DEADLOCKED = 16384 + GROWING = 32768 + REAPING = 65536 + DESTROY = 131072 + + fdefs = { + NOTOUCH : "NTCH", + NODEBUG : "NDBG", + KMEM : "KMEM", + VMEM : "VMEM", + SLAB : "SLAB", + OFFSLAB : "OFSL", + NOEMERGENCY : "NEMG", + DEADLOCKED : "DDLK", + GROWING : "GROW", + REAPING : "REAP", + DESTROY : "DSTR" + } + + def __init__(self, name, flags, size, alloc, slabsize, objsize): + self._name = name + self._flags = self.f2str(flags) + self._size = size + self._alloc = alloc + self._slabsize = slabsize + self._objsize = objsize + + def f2str(self, flags): + fstring = '' + for k in Stat.fdefs.keys(): + if flags & k: + fstring = fstring + Stat.fdefs[k] + '|' + + fstring = fstring[:-1] + return fstring + +class CumulativeStat: + def __init__(self, skey="a"): + self._size = 0 + self._alloc = 0 + self._pct = 0 + self._skey = skey + self._regexp = \ + re.compile('(\w+)\s+(\w+)\s+(\w+)\s+(\w+)\s+(\w+)\s+(\w+)\s+'); + self._stats = defaultdict(list) + + # Add another stat to the dictionary and re-calculate the totals + def add(self, s): + key = 0 + if self._skey == "a": + key = s._alloc + else: + key = s._size + self._stats[key].append(s) + self._size = self._size + s._size + self._alloc = self._alloc + s._alloc + if self._size: + self._pct = 
self._alloc * 100 / self._size
+        else:
+            self._pct = 0
+
+    # Parse the slab info in procfs and calculate cumulative stats
+    def slab_update(self):
+        k = [line.strip() for line in open('/proc/spl/kmem/slab')]
+
+        if not k:
+            sys.stderr.write("No SPL slab stats found\n")
+            sys.exit(1)
+
+        del k[0:2]
+
+        for s in k:
+            if not s:
+                continue
+            m = self._regexp.match(s)
+            if m:
+                self.add(Stat(m.group(1), int(m.group(2),16), int(m.group(3)),
+                    int(m.group(4)), int(m.group(5)), int(m.group(6))))
+            else:
+                sys.stderr.write("Error: unexpected input format: %s\n" % s)
+                exit(-1)
+
+    def show_header(self):
+        sys.stdout.write("\n%25s %20s %15s %15s %15s %15s\n\n" % \
+            ("cache name", "flags", "size", "alloc", "slabsize", "objsize"))
+
+    # Show up to the number of 'rows' of output sorted in descending order
+    # by the key specified earlier; if rows == 0, all rows are shown
+    def show(self, rows):
+        self.show_header()
+        i = 1
+        done = False
+        for k in reversed(sorted(self._stats.keys())):
+            for s in self._stats[k]:
+                sys.stdout.write("%25s %20s %15d %15d %15d %15d\n" % \
+                    (s._name, s._flags, s._size, s._alloc, \
+                    s._slabsize, s._objsize))
+                i = i + 1
+                if rows != 0 and i > rows:
+                    done = True
+                    break
+            if done:
+                break
+        sys.stdout.write("%25s %36d %15d (%d%%)\n\n" % \
+            ("Totals:", self._size, self._alloc, self._pct))
+
+def usage():
+    cmd = "Usage: splslab.py [-n|--num-rows] number [-s|--sort-by] " + \
+        "[interval] [count]"
+    sys.stderr.write("%s\n" % cmd)
+    sys.stderr.write("\t-h : print help\n")
+    sys.stderr.write("\t-n : --num-rows N : limit output to N top " +
+        "largest slabs (default: all)\n")
+    sys.stderr.write("\t-s : --sort-by key : sort output in descending " +
+        "order by total size (s)\n\t\tor allocated size (a) " +
+        "(default: a)\n")
+    sys.stderr.write("\tinterval : repeat every interval seconds\n")
+    sys.stderr.write("\tcount : output statistics count times and exit\n")
+
+
+def main():
+
+    rows = 0
+    count = 0
+    skey = "a"
+    interval = 1
+
+    signal.signal(signal.SIGINT, signal.SIG_DFL)
+
+    try:
+        opts, args = getopt.getopt(
+            sys.argv[1:],
+            "n:s:h",
+            [
+                "num-rows",
+                "sort-by",
+                "help"
+            ]
+        )
+    except getopt.error as e:
+        sys.stderr.write("Error: %s\n" % e.msg)
+        usage()
+        exit(-1)
+
+    i = 1
+    for opt, arg in opts:
+        if opt in ('-n', '--num-rows'):
+            rows = int(arg)
+            i = i + 2
+        elif opt in ('-s', '--sort-by'):
+            if arg != "s" and arg != "a":
+                sys.stderr.write("Error: invalid sorting key \"%s\"\n" % arg)
+                usage()
+                exit(-1)
+            skey = arg
+            i = i + 2
+        elif opt in ('-h', '--help'):
+            usage()
+            exit(0)
+        else:
+            break
+
+    args = sys.argv[i:]
+
+    interval = int(args[0]) if len(args) else interval
+    count = int(args[1]) if len(args) > 1 else count
+
+    i = 0
+    while True:
+        cs = CumulativeStat(skey)
+        cs.slab_update()
+        cs.show(rows)
+
+        i = i + 1
+        if count and i >= count:
+            break
+
+        time.sleep(interval)
+
+    return 0
+
+if __name__ == '__main__':
+    main()
diff -Naur spl-0.6.5.7/config/spl-build.m4 spl-0.6.5.7.new/config/spl-build.m4
--- spl-0.6.5.7/config/spl-build.m4	2015-12-24 01:31:01.000000000 +0100
+++ spl-0.6.5.7.new/config/spl-build.m4	2016-08-01 16:43:34.280796341 +0200
@@ -39,11 +39,15 @@
 	SPL_AC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
 	SPL_AC_SHRINK_CONTROL_STRUCT
 	SPL_AC_RWSEM_SPINLOCK_IS_RAW
+	SPL_AC_RWSEM_ACTIVITY
+	SPL_AC_RWSEM_ATOMIC_LONG_COUNT
 	SPL_AC_SCHED_RT_HEADER
 	SPL_AC_2ARGS_VFS_GETATTR
 	SPL_AC_USLEEP_RANGE
 	SPL_AC_KMEM_CACHE_ALLOCFLAGS
 	SPL_AC_WAIT_ON_BIT
+	SPL_AC_MUTEX_OWNER
+	SPL_AC_INODE_LOCK
 ])
 
 AC_DEFUN([SPL_AC_MODULE_SYMVERS], [
@@ -1315,6 +1319,55 @@
 ])
 
 dnl #
+dnl # 3.16 API Change
+dnl #
+dnl # rwsem-spinlock "->activity" changed to "->count"
+dnl #
+AC_DEFUN([SPL_AC_RWSEM_ACTIVITY], [
+	AC_MSG_CHECKING([whether struct rw_semaphore has member activity])
+	tmp_flags="$EXTRA_KCFLAGS"
+	EXTRA_KCFLAGS="-Werror"
+	SPL_LINUX_TRY_COMPILE([
+		#include <linux/rwsem.h>
+	],[
+		struct rw_semaphore dummy_semaphore __attribute__ ((unused));
+		dummy_semaphore.activity = 0;
+	],[
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_RWSEM_ACTIVITY, 1,
+		    [struct rw_semaphore has member activity])
+	],[
+		AC_MSG_RESULT(no)
+	])
+	EXTRA_KCFLAGS="$tmp_flags"
+])
+
+dnl #
+dnl # 4.8 API Change
+dnl #
+dnl # rwsem "->count" changed to atomic_long_t type
+dnl #
+AC_DEFUN([SPL_AC_RWSEM_ATOMIC_LONG_COUNT], [
+	AC_MSG_CHECKING(
+	    [whether struct rw_semaphore has atomic_long_t member count])
+	tmp_flags="$EXTRA_KCFLAGS"
+	EXTRA_KCFLAGS="-Werror"
+	SPL_LINUX_TRY_COMPILE([
+		#include <linux/rwsem.h>
+	],[
+		DECLARE_RWSEM(dummy_semaphore);
+		(void) atomic_long_read(&dummy_semaphore.count);
+	],[
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_RWSEM_ATOMIC_LONG_COUNT, 1,
+		    [struct rw_semaphore has atomic_long_t member count])
+	],[
+		AC_MSG_RESULT(no)
+	])
+	EXTRA_KCFLAGS="$tmp_flags"
+])
+
+dnl #
 dnl # 3.9 API change,
 dnl # Moved things from linux/sched.h to linux/sched/rt.h
 dnl #
@@ -1447,3 +1500,55 @@
 	AC_MSG_RESULT(no)
 	])
 ])
+
+dnl #
+dnl # Check whether mutex has owner with task_struct type.
+dnl #
+dnl # Note that before Linux 3.0, mutex owner is of type thread_info.
+dnl #
+dnl # Note that in Linux 3.18, the condition for owner is changed from
+dnl # defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) to
+dnl # defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
+dnl #
+AC_DEFUN([SPL_AC_MUTEX_OWNER], [
+	AC_MSG_CHECKING([whether mutex has owner])
+	tmp_flags="$EXTRA_KCFLAGS"
+	EXTRA_KCFLAGS="-Werror"
+	SPL_LINUX_TRY_COMPILE([
+		#include <linux/mutex.h>
+	],[
+		DEFINE_MUTEX(m);
+		struct task_struct *t __attribute__ ((unused));
+		t = m.owner;
+	],[
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_MUTEX_OWNER, 1, [yes])
+	],[
+		AC_MSG_RESULT(no)
+	])
+	EXTRA_KCFLAGS="$tmp_flags"
+])
+
+dnl #
+dnl # 4.7 API change
+dnl # i_mutex is changed to i_rwsem. Instead of directly using
+dnl # i_mutex/i_rwsem, we should use inode_lock() and inode_lock_shared()
+dnl # We test inode_lock_shared because inode_lock is introduced earlier.
+dnl #
+AC_DEFUN([SPL_AC_INODE_LOCK], [
+	AC_MSG_CHECKING([whether inode_lock_shared() exists])
+	tmp_flags="$EXTRA_KCFLAGS"
+	EXTRA_KCFLAGS="-Werror"
+	SPL_LINUX_TRY_COMPILE([
+		#include <linux/fs.h>
+	],[
+		struct inode *inode = NULL;
+		inode_lock_shared(inode);
+	],[
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_INODE_LOCK_SHARED, 1, [yes])
+	],[
+		AC_MSG_RESULT(no)
+	])
+	EXTRA_KCFLAGS="$tmp_flags"
+])
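The four checks added above all follow the same SPL_LINUX_TRY_COMPILE pattern: compile a minimal kernel snippet with -Werror and, on success, AC_DEFINE a HAVE_* macro that SPL source can key compatibility wrappers off. As a minimal sketch of how the inode-locking result would typically be consumed, consider the following; the wrapper name spl_inode_lock_shared() is hypothetical and is not introduced by this patch:

#include <linux/fs.h>
#include <linux/mutex.h>

/*
 * Hypothetical compatibility wrapper (not part of this patch) showing
 * how the HAVE_INODE_LOCK_SHARED result from SPL_AC_INODE_LOCK above
 * would typically be used.
 */
static inline void
spl_inode_lock_shared(struct inode *ip)
{
#if defined(HAVE_INODE_LOCK_SHARED)
	/* Linux >= 4.7: i_rwsem, taken for read */
	inode_lock_shared(ip);
#else
	/* Older kernels: exclusive i_mutex, no shared variant exists */
	mutex_lock(&ip->i_mutex);
#endif
}

On the user-space side, splat.c above implements a small two-phase discovery protocol over the SPLAT_CFG ioctl: a *_COUNT command first returns the number of registered descriptors in cfg_rc1, then the same query is repeated with a *_LIST command against a buffer sized from that count. A condensed sketch of subsystem_setup() under those assumptions (error reporting trimmed; calloc stands in for the malloc/memset pair in the original):

#include <stdlib.h>
#include <sys/ioctl.h>
#include "../include/splat-ctl.h"

static splat_cfg_t *
splat_query_subsystems(int fd)
{
	splat_cfg_t *cfg;
	int n;

	/* Pass 1: how many subsystems are registered? */
	cfg = calloc(1, sizeof (*cfg));
	if (cfg == NULL)
		return (NULL);
	cfg->cfg_magic = SPLAT_CFG_MAGIC;
	cfg->cfg_cmd = SPLAT_CFG_SUBSYSTEM_COUNT;
	if (ioctl(fd, SPLAT_CFG, cfg)) {
		free(cfg);
		return (NULL);
	}
	n = cfg->cfg_rc1;
	free(cfg);

	/* Pass 2: fetch the n descriptors into a correctly sized buffer. */
	cfg = calloc(1, sizeof (*cfg) + n * sizeof (splat_user_t));
	if (cfg == NULL)
		return (NULL);
	cfg->cfg_magic = SPLAT_CFG_MAGIC;
	cfg->cfg_cmd = SPLAT_CFG_SUBSYSTEM_LIST;
	cfg->cfg_data.splat_subsystems.size = n;
	if (ioctl(fd, SPLAT_CFG, cfg)) {
		free(cfg);
		return (NULL);
	}

	/* cfg->cfg_rc1 descriptors are now valid in
	 * cfg->cfg_data.splat_subsystems.descs[]. */
	return (cfg);
}

test_setup() follows the identical two-phase pattern with SPLAT_CFG_TEST_COUNT and SPLAT_CFG_TEST_LIST, adding only cfg_arg1 to select the subsystem of interest.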
diff -Naur spl-0.6.5.7/config/spl-build.m4.orig spl-0.6.5.7.new/config/spl-build.m4.orig
--- spl-0.6.5.7/config/spl-build.m4.orig	1970-01-01 01:00:00.000000000 +0100
+++ spl-0.6.5.7.new/config/spl-build.m4.orig	2016-08-01 16:43:25.376771470 +0200
@@ -0,0 +1,1449 @@
+###############################################################################
+# Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+# Copyright (C) 2007 The Regents of the University of California.
+# Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+###############################################################################
+# SPL_AC_CONFIG_KERNEL: Default SPL kernel configuration.
+###############################################################################
+
+AC_DEFUN([SPL_AC_CONFIG_KERNEL], [
+	SPL_AC_KERNEL
+
+	if test "${LINUX_OBJ}" != "${LINUX}"; then
+		KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$LINUX_OBJ"
+	fi
+	AC_SUBST(KERNELMAKE_PARAMS)
+
+	KERNELCPPFLAGS="$KERNELCPPFLAGS -Wstrict-prototypes"
+	AC_SUBST(KERNELCPPFLAGS)
+
+	SPL_AC_DEBUG
+	SPL_AC_DEBUG_KMEM
+	SPL_AC_DEBUG_KMEM_TRACKING
+	SPL_AC_TEST_MODULE
+	SPL_AC_ATOMIC_SPINLOCK
+	SPL_AC_SHRINKER_CALLBACK
+	SPL_AC_CTL_NAME
+	SPL_AC_PDE_DATA
+	SPL_AC_SET_FS_PWD_WITH_CONST
+	SPL_AC_2ARGS_VFS_UNLINK
+	SPL_AC_4ARGS_VFS_RENAME
+	SPL_AC_2ARGS_VFS_FSYNC
+	SPL_AC_INODE_TRUNCATE_RANGE
+	SPL_AC_FS_STRUCT_SPINLOCK
+	SPL_AC_KUIDGID_T
+	SPL_AC_PUT_TASK_STRUCT
+	SPL_AC_KERNEL_FALLOCATE
+	SPL_AC_CONFIG_ZLIB_INFLATE
+	SPL_AC_CONFIG_ZLIB_DEFLATE
+	SPL_AC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
+	SPL_AC_SHRINK_CONTROL_STRUCT
+	SPL_AC_RWSEM_SPINLOCK_IS_RAW
+	SPL_AC_SCHED_RT_HEADER
+	SPL_AC_2ARGS_VFS_GETATTR
+	SPL_AC_USLEEP_RANGE
+	SPL_AC_KMEM_CACHE_ALLOCFLAGS
+	SPL_AC_WAIT_ON_BIT
+])
+
+AC_DEFUN([SPL_AC_MODULE_SYMVERS], [
+	modpost=$LINUX/scripts/Makefile.modpost
+	AC_MSG_CHECKING([kernel file name for module symbols])
+	if test "x$enable_linux_builtin" != xyes -a -f "$modpost"; then
+		if grep -q Modules.symvers $modpost; then
+			LINUX_SYMBOLS=Modules.symvers
+		else
+			LINUX_SYMBOLS=Module.symvers
+		fi
+
+		if ! test -f "$LINUX_OBJ/$LINUX_SYMBOLS"; then
+			AC_MSG_ERROR([
+	*** Please make sure the kernel devel package for your distribution
+	*** is installed. If you are building with a custom kernel, make sure the
+	*** kernel is configured, built, and the '--with-linux=PATH' configure
+	*** option refers to the location of the kernel source.])
+		fi
+	else
+		LINUX_SYMBOLS=NONE
+	fi
+	AC_MSG_RESULT($LINUX_SYMBOLS)
+	AC_SUBST(LINUX_SYMBOLS)
+])
+
+AC_DEFUN([SPL_AC_KERNEL], [
+	AC_ARG_WITH([linux],
+		AS_HELP_STRING([--with-linux=PATH],
+		[Path to kernel source]),
+		[kernelsrc="$withval"])
+
+	AC_ARG_WITH([linux-obj],
+		AS_HELP_STRING([--with-linux-obj=PATH],
+		[Path to kernel build objects]),
+		[kernelbuild="$withval"])
+
+	AC_MSG_CHECKING([kernel source directory])
+	if test -z "$kernelsrc"; then
+		if test -e "/lib/modules/$(uname -r)/source"; then
+			headersdir="/lib/modules/$(uname -r)/source"
+			sourcelink=$(readlink -f "$headersdir")
+		elif test -e "/lib/modules/$(uname -r)/build"; then
+			headersdir="/lib/modules/$(uname -r)/build"
+			sourcelink=$(readlink -f "$headersdir")
+		else
+			sourcelink=$(ls -1d /usr/src/kernels/* \
+			             /usr/src/linux-* \
+			             2>/dev/null | grep -v obj | tail -1)
+		fi
+
+		if test -n "$sourcelink" && test -e ${sourcelink}; then
+			kernelsrc=`readlink -f ${sourcelink}`
+		else
+			kernelsrc="[Not found]"
+		fi
+	else
+		if test "$kernelsrc" = "NONE"; then
+			kernsrcver=NONE
+		fi
+	fi
+
+	AC_MSG_RESULT([$kernelsrc])
+	if test ! -d "$kernelsrc"; then
+		AC_MSG_ERROR([
+	*** Please make sure the kernel devel package for your distribution
+	*** is installed and then try again.
If that fails, you can specify the + *** location of the kernel source with the '--with-linux=PATH' option.]) + fi + + AC_MSG_CHECKING([kernel build directory]) + if test -z "$kernelbuild"; then + if test -e "/lib/modules/$(uname -r)/build"; then + kernelbuild=`readlink -f /lib/modules/$(uname -r)/build` + elif test -d ${kernelsrc}-obj/${target_cpu}/${target_cpu}; then + kernelbuild=${kernelsrc}-obj/${target_cpu}/${target_cpu} + elif test -d ${kernelsrc}-obj/${target_cpu}/default; then + kernelbuild=${kernelsrc}-obj/${target_cpu}/default + elif test -d `dirname ${kernelsrc}`/build-${target_cpu}; then + kernelbuild=`dirname ${kernelsrc}`/build-${target_cpu} + else + kernelbuild=${kernelsrc} + fi + fi + AC_MSG_RESULT([$kernelbuild]) + + AC_MSG_CHECKING([kernel source version]) + utsrelease1=$kernelbuild/include/linux/version.h + utsrelease2=$kernelbuild/include/linux/utsrelease.h + utsrelease3=$kernelbuild/include/generated/utsrelease.h + if test -r $utsrelease1 && fgrep -q UTS_RELEASE $utsrelease1; then + utsrelease=linux/version.h + elif test -r $utsrelease2 && fgrep -q UTS_RELEASE $utsrelease2; then + utsrelease=linux/utsrelease.h + elif test -r $utsrelease3 && fgrep -q UTS_RELEASE $utsrelease3; then + utsrelease=generated/utsrelease.h + fi + + if test "$utsrelease"; then + kernsrcver=`(echo "#include <$utsrelease>"; + echo "kernsrcver=UTS_RELEASE") | + cpp -I $kernelbuild/include | + grep "^kernsrcver=" | cut -d \" -f 2` + + if test -z "$kernsrcver"; then + AC_MSG_RESULT([Not found]) + AC_MSG_ERROR([*** Cannot determine kernel version.]) + fi + else + AC_MSG_RESULT([Not found]) + if test "x$enable_linux_builtin" != xyes; then + AC_MSG_ERROR([*** Cannot find UTS_RELEASE definition.]) + else + AC_MSG_ERROR([ + *** Cannot find UTS_RELEASE definition. + *** Please run 'make prepare' inside the kernel source tree.]) + fi + fi + + AC_MSG_RESULT([$kernsrcver]) + + LINUX=${kernelsrc} + LINUX_OBJ=${kernelbuild} + LINUX_VERSION=${kernsrcver} + + AC_SUBST(LINUX) + AC_SUBST(LINUX_OBJ) + AC_SUBST(LINUX_VERSION) + + SPL_AC_MODULE_SYMVERS +]) + +dnl # +dnl # Default SPL user configuration +dnl # +AC_DEFUN([SPL_AC_CONFIG_USER], []) + +dnl # +dnl # Check for rpm+rpmbuild to build RPM packages. If these tools +dnl # are missing, it is non-fatal, but you will not be able to build +dnl # RPM packages and will be warned if you try too. +dnl # +dnl # By default, the generic spec file will be used because it requires +dnl # minimal dependencies. Distribution specific spec files can be +dnl # placed under the 'rpm/' directory and enabled using +dnl # the --with-spec= configure option. 

dnl #
dnl # Check for rpm+rpmbuild to build RPM packages. If these tools
dnl # are missing, it is non-fatal, but you will not be able to build
dnl # RPM packages and will be warned if you try to.
dnl #
dnl # By default, the generic spec file will be used because it requires
dnl # minimal dependencies. Distribution-specific spec files can be
dnl # placed under the 'rpm/<distribution>' directory and enabled using
dnl # the --with-spec=<distribution> configure option.
dnl #
AC_DEFUN([SPL_AC_RPM], [
	RPM=rpm
	RPMBUILD=rpmbuild

	AC_MSG_CHECKING([whether $RPM is available])
	AS_IF([tmp=$($RPM --version 2>/dev/null)], [
		RPM_VERSION=$(echo $tmp | $AWK '/RPM/ { print $[3] }')
		HAVE_RPM=yes
		AC_MSG_RESULT([$HAVE_RPM ($RPM_VERSION)])
	],[
		HAVE_RPM=no
		AC_MSG_RESULT([$HAVE_RPM])
	])

	AC_MSG_CHECKING([whether $RPMBUILD is available])
	AS_IF([tmp=$($RPMBUILD --version 2>/dev/null)], [
		RPMBUILD_VERSION=$(echo $tmp | $AWK '/RPM/ { print $[3] }')
		HAVE_RPMBUILD=yes
		AC_MSG_RESULT([$HAVE_RPMBUILD ($RPMBUILD_VERSION)])
	],[
		HAVE_RPMBUILD=no
		AC_MSG_RESULT([$HAVE_RPMBUILD])
	])

	RPM_DEFINE_COMMON='--define "$(DEBUG_SPL) 1" --define "$(DEBUG_KMEM) 1" --define "$(DEBUG_KMEM_TRACKING) 1"'
	RPM_DEFINE_UTIL=
	RPM_DEFINE_KMOD='--define "kernels $(LINUX_VERSION)"'
	RPM_DEFINE_DKMS=

	SRPM_DEFINE_COMMON='--define "build_src_rpm 1"'
	SRPM_DEFINE_UTIL=
	SRPM_DEFINE_KMOD=
	SRPM_DEFINE_DKMS=

	RPM_SPEC_DIR="rpm/generic"
	AC_ARG_WITH([spec],
		AS_HELP_STRING([--with-spec=SPEC],
		[Spec files 'generic|redhat']),
		[RPM_SPEC_DIR="rpm/$withval"])

	AC_MSG_CHECKING([whether spec files are available])
	AC_MSG_RESULT([yes ($RPM_SPEC_DIR/*.spec.in)])

	AC_SUBST(HAVE_RPM)
	AC_SUBST(RPM)
	AC_SUBST(RPM_VERSION)

	AC_SUBST(HAVE_RPMBUILD)
	AC_SUBST(RPMBUILD)
	AC_SUBST(RPMBUILD_VERSION)

	AC_SUBST(RPM_SPEC_DIR)
	AC_SUBST(RPM_DEFINE_UTIL)
	AC_SUBST(RPM_DEFINE_KMOD)
	AC_SUBST(RPM_DEFINE_DKMS)
	AC_SUBST(RPM_DEFINE_COMMON)
	AC_SUBST(SRPM_DEFINE_UTIL)
	AC_SUBST(SRPM_DEFINE_KMOD)
	AC_SUBST(SRPM_DEFINE_DKMS)
	AC_SUBST(SRPM_DEFINE_COMMON)
])

dnl #
dnl # Check for dpkg+dpkg-buildpackage to build DEB packages. If these
dnl # tools are missing, it is non-fatal, but you will not be able to build
dnl # DEB packages and will be warned if you try to.
dnl #
AC_DEFUN([SPL_AC_DPKG], [
	DPKG=dpkg
	DPKGBUILD=dpkg-buildpackage

	AC_MSG_CHECKING([whether $DPKG is available])
	AS_IF([tmp=$($DPKG --version 2>/dev/null)], [
		DPKG_VERSION=$(echo $tmp | $AWK '/Debian/ { print $[7] }')
		HAVE_DPKG=yes
		AC_MSG_RESULT([$HAVE_DPKG ($DPKG_VERSION)])
	],[
		HAVE_DPKG=no
		AC_MSG_RESULT([$HAVE_DPKG])
	])

	AC_MSG_CHECKING([whether $DPKGBUILD is available])
	AS_IF([tmp=$($DPKGBUILD --version 2>/dev/null)], [
		DPKGBUILD_VERSION=$(echo $tmp | \
		    $AWK '/Debian/ { print $[4] }' | cut -f-4 -d'.')
		HAVE_DPKGBUILD=yes
		AC_MSG_RESULT([$HAVE_DPKGBUILD ($DPKGBUILD_VERSION)])
	],[
		HAVE_DPKGBUILD=no
		AC_MSG_RESULT([$HAVE_DPKGBUILD])
	])

	AC_SUBST(HAVE_DPKG)
	AC_SUBST(DPKG)
	AC_SUBST(DPKG_VERSION)

	AC_SUBST(HAVE_DPKGBUILD)
	AC_SUBST(DPKGBUILD)
	AC_SUBST(DPKGBUILD_VERSION)
])

dnl #
dnl # Until native packaging for various different packaging systems
dnl # can be added, the least we can do is attempt to use alien to
dnl # convert the RPM packages to the needed package type. This is
dnl # a hack, but so far it has worked reasonably well.
dnl #
AC_DEFUN([SPL_AC_ALIEN], [
	ALIEN=alien

	AC_MSG_CHECKING([whether $ALIEN is available])
	AS_IF([tmp=$($ALIEN --version 2>/dev/null)], [
		ALIEN_VERSION=$(echo $tmp | $AWK '{ print $[3] }')
		HAVE_ALIEN=yes
		AC_MSG_RESULT([$HAVE_ALIEN ($ALIEN_VERSION)])
	],[
		HAVE_ALIEN=no
		AC_MSG_RESULT([$HAVE_ALIEN])
	])

	AC_SUBST(HAVE_ALIEN)
	AC_SUBST(ALIEN)
	AC_SUBST(ALIEN_VERSION)
])

dnl #
dnl # Using the VENDOR tag from config.guess set the default
dnl # package type for 'make pkg': (rpm | deb | tgz)
dnl #
AC_DEFUN([SPL_AC_DEFAULT_PACKAGE], [
	AC_MSG_CHECKING([linux distribution])
	if test -f /etc/toss-release ; then
		VENDOR=toss ;
	elif test -f /etc/fedora-release ; then
		VENDOR=fedora ;
	elif test -f /etc/redhat-release ; then
		VENDOR=redhat ;
	elif test -f /etc/gentoo-release ; then
		VENDOR=gentoo ;
	elif test -f /etc/arch-release ; then
		VENDOR=arch ;
	elif test -f /etc/SuSE-release ; then
		VENDOR=sles ;
	elif test -f /etc/slackware-version ; then
		VENDOR=slackware ;
	elif test -f /etc/lunar.release ; then
		VENDOR=lunar ;
	elif test -f /etc/lsb-release ; then
		VENDOR=ubuntu ;
	elif test -f /etc/debian_version ; then
		VENDOR=debian ;
	else
		VENDOR= ;
	fi
	AC_MSG_RESULT([$VENDOR])
	AC_SUBST(VENDOR)

	AC_MSG_CHECKING([default package type])
	case "$VENDOR" in
		toss)       DEFAULT_PACKAGE=rpm  ;;
		redhat)     DEFAULT_PACKAGE=rpm  ;;
		fedora)     DEFAULT_PACKAGE=rpm  ;;
		gentoo)     DEFAULT_PACKAGE=tgz  ;;
		arch)       DEFAULT_PACKAGE=tgz  ;;
		sles)       DEFAULT_PACKAGE=rpm  ;;
		slackware)  DEFAULT_PACKAGE=tgz  ;;
		lunar)      DEFAULT_PACKAGE=tgz  ;;
		ubuntu)     DEFAULT_PACKAGE=deb  ;;
		debian)     DEFAULT_PACKAGE=deb  ;;
		*)          DEFAULT_PACKAGE=rpm  ;;
	esac

	AC_MSG_RESULT([$DEFAULT_PACKAGE])
	AC_SUBST(DEFAULT_PACKAGE)
])

dnl #
dnl # Default SPL package configuration
dnl #
AC_DEFUN([SPL_AC_PACKAGE], [
	SPL_AC_DEFAULT_PACKAGE
	SPL_AC_RPM
	SPL_AC_DPKG
	SPL_AC_ALIEN
])

AC_DEFUN([SPL_AC_LICENSE], [
	AC_MSG_CHECKING([spl author])
	AC_MSG_RESULT([$SPL_META_AUTHOR])

	AC_MSG_CHECKING([spl license])
	AC_MSG_RESULT([$SPL_META_LICENSE])
])

AC_DEFUN([SPL_AC_CONFIG], [
	SPL_CONFIG=all
	AC_ARG_WITH([config],
		AS_HELP_STRING([--with-config=CONFIG],
		[Config file 'kernel|user|all|srpm']),
		[SPL_CONFIG="$withval"])
	AC_ARG_ENABLE([linux-builtin],
		[AC_HELP_STRING([--enable-linux-builtin],
		[Configure for builtin in-tree kernel modules @<:@default=no@:>@])],
		[],
		[enable_linux_builtin=no])

	AC_MSG_CHECKING([spl config])
	AC_MSG_RESULT([$SPL_CONFIG]);
	AC_SUBST(SPL_CONFIG)

	case "$SPL_CONFIG" in
		kernel) SPL_AC_CONFIG_KERNEL ;;
		user)   SPL_AC_CONFIG_USER   ;;
		all)    SPL_AC_CONFIG_KERNEL
			SPL_AC_CONFIG_USER   ;;
		srpm)                        ;;
		*)
		AC_MSG_RESULT([Error!])
		AC_MSG_ERROR([Bad value "$SPL_CONFIG" for --with-config,
			use kernel|user|all|srpm]) ;;
	esac

	AM_CONDITIONAL([CONFIG_USER],
	    [test "$SPL_CONFIG" = user -o "$SPL_CONFIG" = all])
	AM_CONDITIONAL([CONFIG_KERNEL],
	    [test "$SPL_CONFIG" = kernel -o "$SPL_CONFIG" = all] &&
	    [test "x$enable_linux_builtin" != xyes ])
])
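In effect, the debug options that follow inject -DDEBUG or -DNDEBUG into KERNELCPPFLAGS, gating assertion-style checks out of production builds. A minimal userspace sketch of the pattern (the ASSERT shown is a stand-in convention, not the SPL's actual macro):

	/* sketch.c -- build with:  cc -DDEBUG sketch.c   (or -DNDEBUG) */
	#include <stdio.h>

	#ifdef DEBUG
	#define ASSERT(x)	do { if (!(x)) \
		printf("ASSERT(%s) failed\n", #x); } while (0)
	#else
	#define ASSERT(x)	((void)0)	/* compiled away in -DNDEBUG builds */
	#endif

	int
	main(void)
	{
		ASSERT(2 + 2 == 5);	/* reports only when built with -DDEBUG */
		return (0);
	}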

dnl #
dnl # Enable this if the SPL should be compiled with internal debugging
dnl # support. By default this support is disabled.
dnl #
AC_DEFUN([SPL_AC_DEBUG], [
	AC_MSG_CHECKING([whether debugging is enabled])
	AC_ARG_ENABLE([debug],
		[AS_HELP_STRING([--enable-debug],
		[Enable generic debug support @<:@default=no@:>@])],
		[],
		[enable_debug=no])

	AS_IF([test "x$enable_debug" = xyes],
	[
		KERNELCPPFLAGS="${KERNELCPPFLAGS} -DDEBUG -Werror"
		DEBUG_CFLAGS="-DDEBUG -Werror"
		DEBUG_SPL="_with_debug"
	], [
		KERNELCPPFLAGS="${KERNELCPPFLAGS} -DNDEBUG"
		DEBUG_CFLAGS="-DNDEBUG"
		DEBUG_SPL="_without_debug"
	])

	AC_SUBST(DEBUG_CFLAGS)
	AC_SUBST(DEBUG_SPL)
	AC_MSG_RESULT([$enable_debug])
])

dnl #
dnl # When enabled, this provides a minimal level of memory tracking.
dnl # A total count of bytes allocated is kept for each alloc and free,
dnl # and at module unload time a report is printed to the console if
dnl # memory was leaked.
dnl #
AC_DEFUN([SPL_AC_DEBUG_KMEM], [
	AC_ARG_ENABLE([debug-kmem],
		[AS_HELP_STRING([--enable-debug-kmem],
		[Enable basic kmem accounting @<:@default=no@:>@])],
		[],
		[enable_debug_kmem=no])

	AS_IF([test "x$enable_debug_kmem" = xyes],
	[
		KERNELCPPFLAGS="${KERNELCPPFLAGS} -DDEBUG_KMEM"
		DEBUG_KMEM="_with_debug_kmem"
		AC_DEFINE([DEBUG_KMEM], [1],
		    [Define to 1 to enable basic kmem accounting])
	], [
		DEBUG_KMEM="_without_debug_kmem"
	])

	AC_SUBST(DEBUG_KMEM)
	AC_MSG_CHECKING([whether basic kmem accounting is enabled])
	AC_MSG_RESULT([$enable_debug_kmem])
])

dnl #
dnl # Disabled by default, this provides detailed memory tracking. This
dnl # feature also requires --enable-debug-kmem to be set. When enabled,
dnl # not only will total bytes be tracked but also the location of every
dnl # alloc and free. When the SPL module is unloaded a list of all leaked
dnl # addresses and where they were allocated will be dumped to the console.
dnl # Enabling this feature has a significant impact on performance, but it
dnl # makes finding memory leaks fairly straightforward.
dnl #
AC_DEFUN([SPL_AC_DEBUG_KMEM_TRACKING], [
	AC_ARG_ENABLE([debug-kmem-tracking],
		[AS_HELP_STRING([--enable-debug-kmem-tracking],
		[Enable detailed kmem tracking @<:@default=no@:>@])],
		[],
		[enable_debug_kmem_tracking=no])

	AS_IF([test "x$enable_debug_kmem_tracking" = xyes],
	[
		KERNELCPPFLAGS="${KERNELCPPFLAGS} -DDEBUG_KMEM_TRACKING"
		DEBUG_KMEM_TRACKING="_with_debug_kmem_tracking"
		AC_DEFINE([DEBUG_KMEM_TRACKING], [1],
		    [Define to 1 to enable detailed kmem tracking])
	], [
		DEBUG_KMEM_TRACKING="_without_debug_kmem_tracking"
	])

	AC_SUBST(DEBUG_KMEM_TRACKING)
	AC_MSG_CHECKING([whether detailed kmem tracking is enabled])
	AC_MSG_RESULT([$enable_debug_kmem_tracking])
])

dnl #
dnl # SPL_LINUX_CONFTEST
dnl #
AC_DEFUN([SPL_LINUX_CONFTEST], [
cat confdefs.h - <<_ACEOF >conftest.c
$1
_ACEOF
])

dnl #
dnl # SPL_LANG_PROGRAM(C)([PROLOGUE], [BODY])
dnl #
m4_define([SPL_LANG_PROGRAM], [
$1
int
main (void)
{
dnl Do *not* indent the following line: there may be CPP directives.
dnl Don't move the `;' right after for the same reason.
+$2 + ; + return 0; +} +]) + +dnl # +dnl # SPL_LINUX_COMPILE_IFELSE / like AC_COMPILE_IFELSE +dnl # +AC_DEFUN([SPL_LINUX_COMPILE_IFELSE], [ + m4_ifvaln([$1], [SPL_LINUX_CONFTEST([$1])]) + rm -Rf build && mkdir -p build && touch build/conftest.mod.c + echo "obj-m := conftest.o" >build/Makefile + modpost_flag='' + test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage + AS_IF( + [AC_TRY_COMMAND(cp conftest.c build && make [$2] -C $LINUX_OBJ EXTRA_CFLAGS="-Werror-implicit-function-declaration $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag) >/dev/null && AC_TRY_COMMAND([$3])], + [$4], + [_AC_MSG_LOG_CONFTEST m4_ifvaln([$5],[$5])] + ) + rm -Rf build +]) + +dnl # +dnl # SPL_LINUX_TRY_COMPILE like AC_TRY_COMPILE +dnl # +AC_DEFUN([SPL_LINUX_TRY_COMPILE], + [SPL_LINUX_COMPILE_IFELSE( + [AC_LANG_SOURCE([SPL_LANG_PROGRAM([[$1]], [[$2]])])], + [modules], + [test -s build/conftest.o], + [$3], [$4]) +]) + +dnl # +dnl # SPL_CHECK_SYMBOL_EXPORT +dnl # check symbol exported or not +dnl # +AC_DEFUN([SPL_CHECK_SYMBOL_EXPORT], [ + grep -q -E '[[[:space:]]]$1[[[:space:]]]' \ + $LINUX_OBJ/Module*.symvers 2>/dev/null + rc=$? + if test $rc -ne 0; then + export=0 + for file in $2; do + grep -q -E "EXPORT_SYMBOL.*($1)" \ + "$LINUX_OBJ/$file" 2>/dev/null + rc=$? + if test $rc -eq 0; then + export=1 + break; + fi + done + if test $export -eq 0; then : + $4 + else : + $3 + fi + else : + $3 + fi +]) + +dnl # +dnl # SPL_LINUX_TRY_COMPILE_SYMBOL +dnl # like SPL_LINUX_TRY_COMPILE, except SPL_CHECK_SYMBOL_EXPORT +dnl # is called if not compiling for builtin +dnl # +AC_DEFUN([SPL_LINUX_TRY_COMPILE_SYMBOL], [ + SPL_LINUX_TRY_COMPILE([$1], [$2], [rc=0], [rc=1]) + if test $rc -ne 0; then : + $6 + else + if test "x$enable_linux_builtin" != xyes; then + SPL_CHECK_SYMBOL_EXPORT([$3], [$4], [rc=0], [rc=1]) + fi + if test $rc -ne 0; then : + $6 + else : + $5 + fi + fi +]) + +dnl # +dnl # SPL_CHECK_SYMBOL_HEADER +dnl # check if a symbol prototype is defined in listed headers. +dnl # +AC_DEFUN([SPL_CHECK_SYMBOL_HEADER], [ + AC_MSG_CHECKING([whether symbol $1 exists in header]) + header=0 + for file in $3; do + grep -q "$2" "$LINUX/$file" 2>/dev/null + rc=$? + if test $rc -eq 0; then + header=1 + break; + fi + done + if test $header -eq 0; then + AC_MSG_RESULT([no]) + $5 + else + AC_MSG_RESULT([yes]) + $4 + fi +]) + +dnl # +dnl # SPL_CHECK_HEADER +dnl # check whether header exists and define HAVE_$2_HEADER +dnl # +AC_DEFUN([SPL_CHECK_HEADER], + [AC_MSG_CHECKING([whether header $1 exists]) + SPL_LINUX_TRY_COMPILE([ + #include <$1> + ],[ + return 0; + ],[ + AC_DEFINE(HAVE_$2_HEADER, 1, [$1 exists]) + AC_MSG_RESULT(yes) + $3 + ],[ + AC_MSG_RESULT(no) + $4 + ]) +]) + +dnl # +dnl # Basic toolchain sanity check. Verify that kernel modules can +dnl # be built and which symbols can be used. +dnl # +AC_DEFUN([SPL_AC_TEST_MODULE], + [AC_MSG_CHECKING([whether modules can be built]) + SPL_LINUX_TRY_COMPILE([],[],[ + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + if test "x$enable_linux_builtin" != xyes; then + AC_MSG_ERROR([*** Unable to build an empty module.]) + else + AC_MSG_ERROR([ + *** Unable to build an empty module. 
	*** Please run 'make scripts' inside the kernel source tree.])
		fi
	])

	AS_IF([test "x$cross_compiling" != xyes], [
		AC_RUN_IFELSE([
			AC_LANG_PROGRAM([
				#include "$LINUX/include/linux/license.h"
			], [
				return !license_is_gpl_compatible(
				    "$SPL_META_LICENSE");
			])
		], [
			AC_DEFINE([SPL_IS_GPL_COMPATIBLE], [1],
			    [Define to 1 if GPL-only symbols can be used])
		], [
		])
	])
])

dnl #
dnl # Use the atomic implementation based on global spinlocks. This
dnl # should only be needed by 32-bit kernels which do not provide
dnl # the atomic64_* API. It may be optionally enabled as a fallback
dnl # if problems are observed with the direct mapping to the native
dnl # Linux atomic operations. You may not disable atomic spinlocks
dnl # if your kernel does not provide the atomic64_* API.
dnl #
AC_DEFUN([SPL_AC_ATOMIC_SPINLOCK], [
	AC_ARG_ENABLE([atomic-spinlocks],
		[AS_HELP_STRING([--enable-atomic-spinlocks],
		[Atomic types use spinlocks @<:@default=check@:>@])],
		[],
		[enable_atomic_spinlocks=check])

	SPL_LINUX_TRY_COMPILE([
		#include <asm/atomic.h>
	],[
		atomic64_t *ptr __attribute__ ((unused));
	],[
		have_atomic64_t=yes
		AC_DEFINE(HAVE_ATOMIC64_T, 1,
		    [kernel defines atomic64_t])
	],[
		have_atomic64_t=no
	])

	AS_IF([test "x$enable_atomic_spinlocks" = xcheck], [
		AS_IF([test "x$have_atomic64_t" = xyes], [
			enable_atomic_spinlocks=no
		],[
			enable_atomic_spinlocks=yes
		])
	])

	AS_IF([test "x$enable_atomic_spinlocks" = xyes], [
		AC_DEFINE([ATOMIC_SPINLOCK], [1],
		    [Atomic types use spinlocks])
	],[
		AS_IF([test "x$have_atomic64_t" = xno], [
			AC_MSG_FAILURE(
			    [--disable-atomic-spinlocks given but required atomic64 support is unavailable])
		])
	])

	AC_MSG_CHECKING([whether atomic types use spinlocks])
	AC_MSG_RESULT([$enable_atomic_spinlocks])

	AC_MSG_CHECKING([whether kernel defines atomic64_t])
	AC_MSG_RESULT([$have_atomic64_t])
])

AC_DEFUN([SPL_AC_SHRINKER_CALLBACK],[
	tmp_flags="$EXTRA_KCFLAGS"
	EXTRA_KCFLAGS="-Werror"
	dnl #
	dnl # 2.6.23 to 2.6.34 API change
	dnl # ->shrink(int nr_to_scan, gfp_t gfp_mask)
	dnl #
	AC_MSG_CHECKING([whether old 2-argument shrinker exists])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/mm.h>

		int shrinker_cb(int nr_to_scan, gfp_t gfp_mask);
	],[
		struct shrinker cache_shrinker = {
			.shrink = shrinker_cb,
			.seeks = DEFAULT_SEEKS,
		};
		register_shrinker(&cache_shrinker);
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_2ARGS_OLD_SHRINKER_CALLBACK, 1,
		    [old shrinker callback wants 2 args])
	],[
		AC_MSG_RESULT(no)
		dnl #
		dnl # 2.6.35 - 2.6.39 API change
		dnl # ->shrink(struct shrinker *,
		dnl #          int nr_to_scan, gfp_t gfp_mask)
		dnl #
		AC_MSG_CHECKING([whether old 3-argument shrinker exists])
		SPL_LINUX_TRY_COMPILE([
			#include <linux/mm.h>

			int shrinker_cb(struct shrinker *, int nr_to_scan,
					gfp_t gfp_mask);
		],[
			struct shrinker cache_shrinker = {
				.shrink = shrinker_cb,
				.seeks = DEFAULT_SEEKS,
			};
			register_shrinker(&cache_shrinker);
		],[
			AC_MSG_RESULT(yes)
			AC_DEFINE(HAVE_3ARGS_SHRINKER_CALLBACK, 1,
			    [old shrinker callback wants 3 args])
		],[
			AC_MSG_RESULT(no)
			dnl #
			dnl # 3.0 - 3.11 API change
			dnl # ->shrink(struct shrinker *,
			dnl #          struct shrink_control *sc)
			dnl #
			AC_MSG_CHECKING(
			    [whether new 2-argument shrinker exists])
			SPL_LINUX_TRY_COMPILE([
				#include <linux/mm.h>

				int shrinker_cb(struct shrinker *,
						struct shrink_control *sc);
			],[
				struct shrinker cache_shrinker = {
					.shrink = shrinker_cb,
					.seeks = DEFAULT_SEEKS,
				};
				register_shrinker(&cache_shrinker);
			],[
				AC_MSG_RESULT(yes)
				AC_DEFINE(HAVE_2ARGS_NEW_SHRINKER_CALLBACK, 1,
				    [new shrinker callback wants 2 args])
			],[
				AC_MSG_RESULT(no)
				dnl #
				dnl # 3.12 API change,
				dnl # ->shrink() is logically split into
				dnl # ->count_objects() and ->scan_objects()
				dnl #
				AC_MSG_CHECKING(
				    [whether ->count_objects callback exists])
				SPL_LINUX_TRY_COMPILE([
					#include <linux/mm.h>

					unsigned long shrinker_cb(
						struct shrinker *,
						struct shrink_control *sc);
				],[
					struct shrinker cache_shrinker = {
						.count_objects = shrinker_cb,
						.scan_objects = shrinker_cb,
						.seeks = DEFAULT_SEEKS,
					};
					register_shrinker(&cache_shrinker);
				],[
					AC_MSG_RESULT(yes)
					AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK,
					    1, [->count_objects exists])
				],[
					AC_MSG_ERROR(error)
				])
			])
		])
	])
	EXTRA_KCFLAGS="$tmp_flags"
])

dnl #
dnl # 2.6.33 API change,
dnl # Removed .ctl_name from struct ctl_table.
dnl #
AC_DEFUN([SPL_AC_CTL_NAME], [
	AC_MSG_CHECKING([whether struct ctl_table has ctl_name])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/sysctl.h>
	],[
		struct ctl_table ctl __attribute__ ((unused));
		ctl.ctl_name = 0;
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_CTL_NAME, 1, [struct ctl_table has ctl_name])
	],[
		AC_MSG_RESULT(no)
	])
])

dnl #
dnl # 3.10 API change,
dnl # PDE is replaced by PDE_DATA
dnl #
AC_DEFUN([SPL_AC_PDE_DATA], [
	AC_MSG_CHECKING([whether PDE_DATA() is available])
	SPL_LINUX_TRY_COMPILE_SYMBOL([
		#include <linux/proc_fs.h>
	], [
		PDE_DATA(NULL);
	], [PDE_DATA], [], [
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_PDE_DATA, 1, [yes])
	],[
		AC_MSG_RESULT(no)
	])
])

dnl #
dnl # 3.9 API change
dnl # set_fs_pwd takes const struct path *
dnl #
AC_DEFUN([SPL_AC_SET_FS_PWD_WITH_CONST],
	tmp_flags="$EXTRA_KCFLAGS"
	EXTRA_KCFLAGS="-Werror"
	[AC_MSG_CHECKING([whether set_fs_pwd() requires const struct path *])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/spinlock.h>
		#include <linux/fs_struct.h>
		#include <linux/path.h>
		void (*const set_fs_pwd_func)
			(struct fs_struct *, const struct path *)
			= set_fs_pwd;
	],[
		return 0;
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_SET_FS_PWD_WITH_CONST, 1,
		    [set_fs_pwd() needs const path *])
	],[
		SPL_LINUX_TRY_COMPILE([
			#include <linux/spinlock.h>
			#include <linux/fs_struct.h>
			#include <linux/path.h>
			void (*const set_fs_pwd_func)
				(struct fs_struct *, struct path *)
				= set_fs_pwd;
		],[
			return 0;
		],[
			AC_MSG_RESULT(no)
		],[
			AC_MSG_ERROR(unknown)
		])
	])
	EXTRA_KCFLAGS="$tmp_flags"
])

dnl #
dnl # 3.13 API change
dnl # vfs_unlink() updated to take a third delegated_inode argument.
dnl #
AC_DEFUN([SPL_AC_2ARGS_VFS_UNLINK],
	[AC_MSG_CHECKING([whether vfs_unlink() wants 2 args])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/fs.h>
	],[
		vfs_unlink((struct inode *) NULL, (struct dentry *) NULL);
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_2ARGS_VFS_UNLINK, 1,
		    [vfs_unlink() wants 2 args])
	],[
		AC_MSG_RESULT(no)
		dnl #
		dnl # Linux 3.13 API change
		dnl # Added delegated inode
		dnl #
		AC_MSG_CHECKING([whether vfs_unlink() wants 3 args])
		SPL_LINUX_TRY_COMPILE([
			#include <linux/fs.h>
		],[
			vfs_unlink((struct inode *) NULL,
			    (struct dentry *) NULL,
			    (struct inode **) NULL);
		],[
			AC_MSG_RESULT(yes)
			AC_DEFINE(HAVE_3ARGS_VFS_UNLINK, 1,
			    [vfs_unlink() wants 3 args])
		],[
			AC_MSG_ERROR(no)
		])

	])
])
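Results such as HAVE_3ARGS_VFS_UNLINK above, and the HAVE_*ARGS_VFS_RENAME defines produced by the next check, are typically consumed through a thin wrapper so callers never see the per-kernel differences. A sketch under that assumption (spl_vfs_rename is a hypothetical name, not necessarily the SPL's actual wrapper):

	#include <linux/fs.h>

	/* illustrative only: select the right vfs_rename() call at compile time */
	static inline int
	spl_vfs_rename(struct inode *sdip, struct dentry *sdp,
	    struct inode *tdip, struct dentry *tdp)
	{
	#if defined(HAVE_6ARGS_VFS_RENAME)
		return (vfs_rename(sdip, sdp, tdip, tdp, NULL, 0));
	#elif defined(HAVE_5ARGS_VFS_RENAME)
		return (vfs_rename(sdip, sdp, tdip, tdp, NULL));
	#else /* HAVE_4ARGS_VFS_RENAME */
		return (vfs_rename(sdip, sdp, tdip, tdp));
	#endif
	}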

dnl #
dnl # 3.13 and 3.15 API changes
dnl # Added delegated inode and flags argument.
dnl #
AC_DEFUN([SPL_AC_4ARGS_VFS_RENAME],
	[AC_MSG_CHECKING([whether vfs_rename() wants 4 args])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/fs.h>
	],[
		vfs_rename((struct inode *) NULL, (struct dentry *) NULL,
		    (struct inode *) NULL, (struct dentry *) NULL);
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_4ARGS_VFS_RENAME, 1,
		    [vfs_rename() wants 4 args])
	],[
		AC_MSG_RESULT(no)
		dnl #
		dnl # Linux 3.13 API change
		dnl # Added delegated inode
		dnl #
		AC_MSG_CHECKING([whether vfs_rename() wants 5 args])
		SPL_LINUX_TRY_COMPILE([
			#include <linux/fs.h>
		],[
			vfs_rename((struct inode *) NULL,
			    (struct dentry *) NULL,
			    (struct inode *) NULL,
			    (struct dentry *) NULL,
			    (struct inode **) NULL);
		],[
			AC_MSG_RESULT(yes)
			AC_DEFINE(HAVE_5ARGS_VFS_RENAME, 1,
			    [vfs_rename() wants 5 args])
		],[
			AC_MSG_RESULT(no)
			dnl #
			dnl # Linux 3.15 API change
			dnl # Added flags
			dnl #
			AC_MSG_CHECKING([whether vfs_rename() wants 6 args])
			SPL_LINUX_TRY_COMPILE([
				#include <linux/fs.h>
			],[
				vfs_rename((struct inode *) NULL,
				    (struct dentry *) NULL,
				    (struct inode *) NULL,
				    (struct dentry *) NULL,
				    (struct inode **) NULL,
				    (unsigned int) 0);
			],[
				AC_MSG_RESULT(yes)
				AC_DEFINE(HAVE_6ARGS_VFS_RENAME, 1,
				    [vfs_rename() wants 6 args])
			],[
				AC_MSG_ERROR(no)
			])
		])
	])
])

dnl #
dnl # 2.6.36 API change,
dnl # The 'struct fs_struct->lock' was changed from a rwlock_t to
dnl # a spinlock_t to improve the fastpath performance.
dnl #
AC_DEFUN([SPL_AC_FS_STRUCT_SPINLOCK], [
	AC_MSG_CHECKING([whether struct fs_struct uses spinlock_t])
	tmp_flags="$EXTRA_KCFLAGS"
	EXTRA_KCFLAGS="-Werror"
	SPL_LINUX_TRY_COMPILE([
		#include <linux/sched.h>
		#include <linux/fs_struct.h>
	],[
		static struct fs_struct fs;
		spin_lock_init(&fs.lock);
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_FS_STRUCT_SPINLOCK, 1,
		    [struct fs_struct uses spinlock_t])
	],[
		AC_MSG_RESULT(no)
	])
	EXTRA_KCFLAGS="$tmp_flags"
])

dnl #
dnl # User namespaces: use kuid_t in place of uid_t where available.
dnl # Not strictly a user-namespace requirement, but it should prevent
dnl # surprises.
dnl #
AC_DEFUN([SPL_AC_KUIDGID_T], [
	AC_MSG_CHECKING([whether kuid_t/kgid_t is available])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/uidgid.h>
	], [
		kuid_t userid = KUIDT_INIT(0);
		kgid_t groupid = KGIDT_INIT(0);
	],[
		SPL_LINUX_TRY_COMPILE([
			#include <linux/uidgid.h>
		], [
			kuid_t userid = 0;
			kgid_t groupid = 0;
		],[
			AC_MSG_RESULT(yes; optional)
		],[
			AC_MSG_RESULT(yes; mandatory)
			AC_DEFINE(HAVE_KUIDGID_T, 1, [kuid_t/kgid_t in use])
		])
	],[
		AC_MSG_RESULT(no)
	])
])

dnl #
dnl # 2.6.39 API change,
dnl # __put_task_struct() was exported by the mainline kernel.
dnl #
AC_DEFUN([SPL_AC_PUT_TASK_STRUCT],
	[AC_MSG_CHECKING([whether __put_task_struct() is available])
	SPL_LINUX_TRY_COMPILE_SYMBOL([
		#include <linux/sched.h>
	], [
		__put_task_struct(NULL);
	], [__put_task_struct], [], [
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_PUT_TASK_STRUCT, 1,
		    [__put_task_struct() is available])
	], [
		AC_MSG_RESULT(no)
	])
])
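The result of the vfs_fsync() check that follows is consumed by the spl_filp_fsync() wrapper visible later in this diff (include/linux/file_compat.h); the two-arg side shown here is inferred from that hunk:

	/* sketch: how HAVE_2ARGS_VFS_FSYNC selects the wrapper definition */
	#ifdef HAVE_2ARGS_VFS_FSYNC
	#define spl_filp_fsync(fp, sync)	vfs_fsync(fp, sync)
	#else
	#define spl_filp_fsync(fp, sync)	vfs_fsync(fp, (fp)->f_dentry, sync)
	#endif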

dnl #
dnl # 2.6.35 API change,
dnl # Unused 'struct dentry *' removed from vfs_fsync() prototype.
dnl #
AC_DEFUN([SPL_AC_2ARGS_VFS_FSYNC], [
	AC_MSG_CHECKING([whether vfs_fsync() wants 2 args])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/fs.h>
	],[
		vfs_fsync(NULL, 0);
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_2ARGS_VFS_FSYNC, 1, [vfs_fsync() wants 2 args])
	],[
		AC_MSG_RESULT(no)
	])
])

dnl #
dnl # 3.5 API change,
dnl # inode_operations.truncate_range removed
dnl #
AC_DEFUN([SPL_AC_INODE_TRUNCATE_RANGE], [
	AC_MSG_CHECKING([whether truncate_range() inode operation is available])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/fs.h>
	],[
		struct inode_operations ops;
		ops.truncate_range = NULL;
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_INODE_TRUNCATE_RANGE, 1,
		    [truncate_range() inode operation is available])
	],[
		AC_MSG_RESULT(no)
	])
])

dnl #
dnl # Linux 2.6.38 - 3.x API
dnl #
AC_DEFUN([SPL_AC_KERNEL_FILE_FALLOCATE], [
	AC_MSG_CHECKING([whether fops->fallocate() exists])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/fs.h>
	],[
		long (*fallocate) (struct file *, int, loff_t, loff_t) = NULL;
		struct file_operations fops __attribute__ ((unused)) = {
			.fallocate = fallocate,
		};
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_FILE_FALLOCATE, 1, [fops->fallocate() exists])
	],[
		AC_MSG_RESULT(no)
	])
])

dnl #
dnl # Linux 2.6.x - 2.6.37 API
dnl #
AC_DEFUN([SPL_AC_KERNEL_INODE_FALLOCATE], [
	AC_MSG_CHECKING([whether iops->fallocate() exists])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/fs.h>
	],[
		long (*fallocate) (struct inode *, int, loff_t, loff_t) = NULL;
		struct inode_operations fops __attribute__ ((unused)) = {
			.fallocate = fallocate,
		};
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_INODE_FALLOCATE, 1, [iops->fallocate() exists])
	],[
		AC_MSG_RESULT(no)
	])
])

dnl #
dnl # PaX Linux 2.6.38 - 3.x API
dnl #
AC_DEFUN([SPL_AC_PAX_KERNEL_FILE_FALLOCATE], [
	AC_MSG_CHECKING([whether fops->fallocate() exists])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/fs.h>
	],[
		long (*fallocate) (struct file *, int, loff_t, loff_t) = NULL;
		struct file_operations_no_const fops __attribute__ ((unused)) = {
			.fallocate = fallocate,
		};
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_FILE_FALLOCATE, 1, [fops->fallocate() exists])
	],[
		AC_MSG_RESULT(no)
	])
])

dnl #
dnl # The fallocate callback was moved from the inode_operations
dnl # structure to the file_operations structure.
dnl #
AC_DEFUN([SPL_AC_KERNEL_FALLOCATE], [
	SPL_AC_KERNEL_FILE_FALLOCATE
	SPL_AC_KERNEL_INODE_FALLOCATE
	SPL_AC_PAX_KERNEL_FILE_FALLOCATE
])

dnl #
dnl # zlib inflate compat,
dnl # Verify the kernel has CONFIG_ZLIB_INFLATE support enabled.
dnl #
AC_DEFUN([SPL_AC_CONFIG_ZLIB_INFLATE], [
	AC_MSG_CHECKING([whether CONFIG_ZLIB_INFLATE is defined])
	SPL_LINUX_TRY_COMPILE([
		#if !defined(CONFIG_ZLIB_INFLATE) && \
		    !defined(CONFIG_ZLIB_INFLATE_MODULE)
		#error CONFIG_ZLIB_INFLATE not defined
		#endif
	],[ ],[
		AC_MSG_RESULT([yes])
	],[
		AC_MSG_RESULT([no])
		AC_MSG_ERROR([
	*** This kernel does not include the required zlib inflate support.
	*** Rebuild the kernel with CONFIG_ZLIB_INFLATE=y|m set.])
	])
])
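Once these zlib probes run, their results are typically absorbed by a one-line wrapper so the rest of the code can always pass both parameters; a sketch (the spl_zlib_deflate_workspacesize name is an assumption, modeled on the SPL's zmod compatibility header):

	#include <linux/zlib.h>

	/* sketch: always call with two args; drop them on pre-2.6.39 kernels */
	#ifdef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
	#define spl_zlib_deflate_workspacesize(wb, ml) \
		zlib_deflate_workspacesize(wb, ml)
	#else
	#define spl_zlib_deflate_workspacesize(wb, ml) \
		zlib_deflate_workspacesize()
	#endif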

dnl #
dnl # zlib deflate compat,
dnl # Verify the kernel has CONFIG_ZLIB_DEFLATE support enabled.
dnl #
AC_DEFUN([SPL_AC_CONFIG_ZLIB_DEFLATE], [
	AC_MSG_CHECKING([whether CONFIG_ZLIB_DEFLATE is defined])
	SPL_LINUX_TRY_COMPILE([
		#if !defined(CONFIG_ZLIB_DEFLATE) && \
		    !defined(CONFIG_ZLIB_DEFLATE_MODULE)
		#error CONFIG_ZLIB_DEFLATE not defined
		#endif
	],[ ],[
		AC_MSG_RESULT([yes])
	],[
		AC_MSG_RESULT([no])
		AC_MSG_ERROR([
	*** This kernel does not include the required zlib deflate support.
	*** Rebuild the kernel with CONFIG_ZLIB_DEFLATE=y|m set.])
	])
])

dnl #
dnl # 2.6.39 API compat,
dnl # The function zlib_deflate_workspacesize() now takes 2 arguments.
dnl # This was done to avoid always having to allocate the maximum size
dnl # workspace (268K). The caller can now specify the windowBits and
dnl # memLevel compression parameters to get a smaller workspace.
dnl #
AC_DEFUN([SPL_AC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE],
	[AC_MSG_CHECKING([whether zlib_deflate_workspacesize() wants 2 args])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/zlib.h>
	],[
		return zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL);
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE, 1,
		    [zlib_deflate_workspacesize() wants 2 args])
	],[
		AC_MSG_RESULT(no)
	])
])

dnl #
dnl # 2.6.39 API change,
dnl # Shrinkers were adjusted to use the common shrink_control structure.
dnl #
AC_DEFUN([SPL_AC_SHRINK_CONTROL_STRUCT], [
	AC_MSG_CHECKING([whether struct shrink_control exists])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/mm.h>
	],[
		struct shrink_control sc __attribute__ ((unused));

		sc.nr_to_scan = 0;
		sc.gfp_mask = GFP_KERNEL;
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_SHRINK_CONTROL_STRUCT, 1,
		    [struct shrink_control exists])
	],[
		AC_MSG_RESULT(no)
	])
])

dnl #
dnl # 3.1 API Change
dnl #
dnl # The rw_semaphore.wait_lock member was changed from spinlock_t to
dnl # raw_spinlock_t at commit ddb6c9b58a19edcfac93ac670b066c836ff729f1.
dnl #
AC_DEFUN([SPL_AC_RWSEM_SPINLOCK_IS_RAW], [
	AC_MSG_CHECKING([whether struct rw_semaphore member wait_lock is raw])
	tmp_flags="$EXTRA_KCFLAGS"
	EXTRA_KCFLAGS="-Werror"
	SPL_LINUX_TRY_COMPILE([
		#include <linux/rwsem.h>
	],[
		struct rw_semaphore dummy_semaphore __attribute__ ((unused));
		raw_spinlock_t dummy_lock __attribute__ ((unused));
		dummy_semaphore.wait_lock = dummy_lock;
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(RWSEM_SPINLOCK_IS_RAW, 1,
		    [struct rw_semaphore member wait_lock is raw_spinlock_t])
	],[
		AC_MSG_RESULT(no)
	])
	EXTRA_KCFLAGS="$tmp_flags"
])

dnl #
dnl # 3.9 API change,
dnl # Moved things from linux/sched.h to linux/sched/rt.h
dnl #
AC_DEFUN([SPL_AC_SCHED_RT_HEADER],
	[AC_MSG_CHECKING([whether header linux/sched/rt.h exists])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/sched.h>
		#include <linux/sched/rt.h>
	],[
		return 0;
	],[
		AC_DEFINE(HAVE_SCHED_RT_HEADER, 1, [linux/sched/rt.h exists])
		AC_MSG_RESULT(yes)
	],[
		AC_MSG_RESULT(no)
	])
])

dnl #
dnl # 3.9 API change,
dnl # vfs_getattr() uses 2 args
dnl # It takes struct path * instead of struct vfsmount * and struct dentry *
dnl #
AC_DEFUN([SPL_AC_2ARGS_VFS_GETATTR], [
	AC_MSG_CHECKING([whether vfs_getattr() wants])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/fs.h>
	],[
		vfs_getattr((struct path *) NULL,
			(struct kstat *)NULL);
	],[
		AC_MSG_RESULT(2 args)
		AC_DEFINE(HAVE_2ARGS_VFS_GETATTR, 1,
		    [vfs_getattr wants 2 args])
	],[
		SPL_LINUX_TRY_COMPILE([
			#include <linux/fs.h>
		],[
			vfs_getattr((struct vfsmount *)NULL,
				(struct dentry *)NULL,
				(struct kstat *)NULL);
		],[
			AC_MSG_RESULT(3 args)
		],[
			AC_MSG_ERROR(unknown)
		])
	])
])

dnl #
dnl # 2.6.36 API compatibility.
dnl # Added usleep_range timer.
dnl # usleep_range is a finer precision implementation of msleep
dnl # designed to be a drop-in replacement for udelay where a precise
dnl # sleep / busy-wait is unnecessary.
dnl #
AC_DEFUN([SPL_AC_USLEEP_RANGE], [
	AC_MSG_CHECKING([whether usleep_range() is available])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/delay.h>
	],[
		usleep_range(0, 0);
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_USLEEP_RANGE, 1,
		    [usleep_range is available])
	],[
		AC_MSG_RESULT(no)
	])
])

dnl #
dnl # 2.6.35 API change,
dnl # The cachep->gfpflags member was renamed cachep->allocflags. These are
dnl # private allocation flags which are applied when allocating a new slab
dnl # in kmem_getpages(). Unfortunately there is no public API for setting
dnl # non-default flags.
dnl #
AC_DEFUN([SPL_AC_KMEM_CACHE_ALLOCFLAGS], [
	AC_MSG_CHECKING([whether struct kmem_cache has allocflags])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/slab.h>
	],[
		struct kmem_cache cachep __attribute__ ((unused));
		cachep.allocflags = GFP_KERNEL;
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_KMEM_CACHE_ALLOCFLAGS, 1,
		    [struct kmem_cache has allocflags])
	],[
		AC_MSG_RESULT(no)

		AC_MSG_CHECKING([whether struct kmem_cache has gfpflags])
		SPL_LINUX_TRY_COMPILE([
			#include <linux/slab.h>
		],[
			struct kmem_cache cachep __attribute__ ((unused));
			cachep.gfpflags = GFP_KERNEL;
		],[
			AC_MSG_RESULT(yes)
			AC_DEFINE(HAVE_KMEM_CACHE_GFPFLAGS, 1,
			    [struct kmem_cache has gfpflags])
		],[
			AC_MSG_RESULT(no)
		])
	])
])
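The last probe in this series, just below, handles the 3.17 wait_on_bit() change; a sketch of how its HAVE_WAIT_ON_BIT_ACTION result might be consumed (spl_wait_on_bit and spl_bit_wait are hypothetical names, shown only to illustrate the pattern):

	#include <linux/wait.h>
	#include <linux/sched.h>

	#ifdef HAVE_WAIT_ON_BIT_ACTION
	/* pre-3.17 kernels: the caller must supply the action that sleeps */
	static inline int
	spl_bit_wait(void *word)
	{
		schedule();
		return (0);
	}
	#define spl_wait_on_bit(word, bit, mode) \
		wait_on_bit(word, bit, spl_bit_wait, mode)
	#else
	/* 3.17 and newer: the kernel supplies the default waiting behavior */
	#define spl_wait_on_bit(word, bit, mode) \
		wait_on_bit(word, bit, mode)
	#endif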

dnl #
dnl # 3.17 API change,
dnl # wait_on_bit() no longer requires an action argument. The former
dnl # "wait_on_bit" interface required an 'action' function to be provided
dnl # which does the actual waiting. There were over 20 such functions in the
dnl # kernel, many of them identical, though most cases can be satisfied by one
dnl # of just two functions: one which uses io_schedule() and one which just
dnl # uses schedule(). This API change was made to consolidate all of those
dnl # redundant wait functions.
dnl #
AC_DEFUN([SPL_AC_WAIT_ON_BIT], [
	AC_MSG_CHECKING([whether wait_on_bit() takes an action])
	SPL_LINUX_TRY_COMPILE([
		#include <linux/wait.h>
	],[
		int (*action)(void *) = NULL;
		wait_on_bit(NULL, 0, action, 0);
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_WAIT_ON_BIT_ACTION, 1, [yes])
	],[
		AC_MSG_RESULT(no)
	])
])
diff -Naur spl-0.6.5.7/configure.ac spl-0.6.5.7.new/configure.ac
--- spl-0.6.5.7/configure.ac	2015-12-24 01:18:07.000000000 +0100
+++ spl-0.6.5.7.new/configure.ac	2016-08-01 16:43:23.436766051 +0200
@@ -54,6 +54,8 @@
 	man/man5/Makefile
 	lib/Makefile
 	cmd/Makefile
+	cmd/splat/Makefile
+	cmd/splslab/Makefile
 	module/Makefile
 	module/spl/Makefile
 	module/splat/Makefile
diff -Naur spl-0.6.5.7/include/linux/file_compat.h spl-0.6.5.7.new/include/linux/file_compat.h
--- spl-0.6.5.7/include/linux/file_compat.h	2015-12-24 01:18:07.000000000 +0100
+++ spl-0.6.5.7.new/include/linux/file_compat.h	2016-08-01 16:43:34.278796336 +0200
@@ -76,8 +76,25 @@
 #define spl_filp_fsync(fp, sync)	vfs_fsync(fp, (fp)->f_dentry, sync)
 #endif /* HAVE_2ARGS_VFS_FSYNC */
 
+#ifdef HAVE_INODE_LOCK_SHARED
+#define spl_inode_lock(ip)		inode_lock(ip)
+#define spl_inode_unlock(ip)		inode_unlock(ip)
+#define spl_inode_lock_shared(ip)	inode_lock_shared(ip)
+#define spl_inode_unlock_shared(ip)	inode_unlock_shared(ip)
+#define spl_inode_trylock(ip)		inode_trylock(ip)
+#define spl_inode_trylock_shared(ip)	inode_trylock_shared(ip)
+#define spl_inode_is_locked(ip)		inode_is_locked(ip)
+#define spl_inode_lock_nested(ip, s)	inode_lock_nested(ip, s)
+#else
 #define spl_inode_lock(ip)		mutex_lock(&(ip)->i_mutex)
 #define spl_inode_unlock(ip)		mutex_unlock(&(ip)->i_mutex)
+#define spl_inode_lock_shared(ip)	mutex_lock(&(ip)->i_mutex)
+#define spl_inode_unlock_shared(ip)	mutex_unlock(&(ip)->i_mutex)
+#define spl_inode_trylock(ip)		mutex_trylock(&(ip)->i_mutex)
+#define spl_inode_trylock_shared(ip)	mutex_trylock(&(ip)->i_mutex)
+#define spl_inode_is_locked(ip)		mutex_is_locked(&(ip)->i_mutex)
+#define spl_inode_lock_nested(ip, s)	mutex_lock_nested(&(ip)->i_mutex, s)
+#endif
 
 #endif /* SPL_FILE_COMPAT_H */
diff -Naur spl-0.6.5.7/include/linux/rwsem_compat.h spl-0.6.5.7.new/include/linux/rwsem_compat.h
--- spl-0.6.5.7/include/linux/rwsem_compat.h	2015-09-19 21:51:19.000000000 +0200
+++ spl-0.6.5.7.new/include/linux/rwsem_compat.h	2016-08-01 16:43:34.281796344 +0200
@@ -27,6 +27,26 @@
 
 #include <linux/rwsem.h>
 
+#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
+#define SPL_RWSEM_SINGLE_READER_VALUE	(1)
+#define SPL_RWSEM_SINGLE_WRITER_VALUE	(-1)
+#else
+#define SPL_RWSEM_SINGLE_READER_VALUE	(RWSEM_ACTIVE_READ_BIAS)
+#define SPL_RWSEM_SINGLE_WRITER_VALUE	(RWSEM_ACTIVE_WRITE_BIAS)
+#endif
+
+/* Linux 3.16 changed activity to count for rwsem-spinlock */
+#if defined(HAVE_RWSEM_ACTIVITY)
+#define RWSEM_COUNT(sem)	sem->activity
+/* Linux 4.8 changed count to an atomic_long_t for !rwsem-spinlock */
+#elif defined(HAVE_RWSEM_ATOMIC_LONG_COUNT)
+#define RWSEM_COUNT(sem)	atomic_long_read(&(sem)->count)
+#else
+#define RWSEM_COUNT(sem)	sem->count
+#endif
+
+int rwsem_tryupgrade(struct rw_semaphore *rwsem);
+
 #if defined(RWSEM_SPINLOCK_IS_RAW)
 #define spl_rwsem_lock_irqsave(lk, fl)		raw_spin_lock_irqsave(lk, fl)
 #define spl_rwsem_unlock_irqrestore(lk, fl)	raw_spin_unlock_irqrestore(lk, fl)
diff -Naur spl-0.6.5.7/include/sys/byteorder.h spl-0.6.5.7.new/include/sys/byteorder.h
--- spl-0.6.5.7/include/sys/byteorder.h	2013-03-22 23:19:11.000000000 +0100
+++ spl-0.6.5.7.new/include/sys/byteorder.h	2016-08-01 16:43:33.091793020 +0200
@@ -26,6 +26,7 @@
 #define _SPL_BYTEORDER_H
 
 #include <asm/byteorder.h>
+#include <sys/isa_defs.h>
 
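(The byteorder.h additions continue below.) The htonll() being introduced swaps a 64-bit value by byte-swapping each 32-bit half and exchanging the halves; a userspace sketch of the same arithmetic:

	#include <stdio.h>
	#include <stdint.h>
	#include <arpa/inet.h>	/* htonl(), for demonstration outside the kernel */

	static unsigned long long
	demo_htonll(unsigned long long n)
	{
		/* same construction as the kernel-side htonll() below */
		return (((unsigned long long)htonl((uint32_t)n)) << 32) +
		    htonl((uint32_t)(n >> 32));
	}

	int
	main(void)
	{
		/* prints 8877665544332211 on a little-endian host */
		printf("%llx\n", demo_htonll(0x1122334455667788ULL));
		return (0);
	}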
 #define LE_16(x)	cpu_to_le16(x)
 #define LE_32(x)	cpu_to_le32(x)
@@ -43,4 +44,26 @@
 #define BE_IN32(xa) \
 	(((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2))
 
+#ifdef _BIG_ENDIAN
+static __inline__ uint64_t
+htonll(uint64_t n) {
+	return (n);
+}
+
+static __inline__ uint64_t
+ntohll(uint64_t n) {
+	return (n);
+}
+#else
+static __inline__ uint64_t
+htonll(uint64_t n) {
+	return ((((uint64_t)htonl(n)) << 32) + htonl(n >> 32));
+}
+
+static __inline__ uint64_t
+ntohll(uint64_t n) {
+	return ((((uint64_t)ntohl(n)) << 32) + ntohl(n >> 32));
+}
+#endif
+
 #endif /* SPL_BYTEORDER_H */
diff -Naur spl-0.6.5.7/include/sys/condvar.h spl-0.6.5.7.new/include/sys/condvar.h
--- spl-0.6.5.7/include/sys/condvar.h	2015-09-19 21:51:19.000000000 +0200
+++ spl-0.6.5.7.new/include/sys/condvar.h	2016-08-01 16:43:34.276796330 +0200
@@ -59,6 +59,8 @@
 extern clock_t __cv_timedwait_sig(kcondvar_t *, kmutex_t *, clock_t);
 extern clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t,
     hrtime_t res, int flag);
+extern clock_t cv_timedwait_sig_hires(kcondvar_t *, kmutex_t *, hrtime_t,
+    hrtime_t res, int flag);
 extern void __cv_signal(kcondvar_t *);
 extern void __cv_broadcast(kcondvar_t *c);
diff -Naur spl-0.6.5.7/include/sys/dkioc_free_util.h spl-0.6.5.7.new/include/sys/dkioc_free_util.h
--- spl-0.6.5.7/include/sys/dkioc_free_util.h	1970-01-01 01:00:00.000000000 +0100
+++ spl-0.6.5.7.new/include/sys/dkioc_free_util.h	2016-08-01 16:43:34.275796327 +0200
@@ -0,0 +1,58 @@
+/*****************************************************************************\
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+\*****************************************************************************/
+
+#ifndef _SPL_DKIOC_UTIL_H
+#define _SPL_DKIOC_UTIL_H
+
+#include <sys/dkio.h>
+
+typedef struct dkioc_free_list_ext_s {
+	uint64_t		dfle_start;
+	uint64_t		dfle_length;
+} dkioc_free_list_ext_t;
+
+typedef struct dkioc_free_list_s {
+	uint64_t		dfl_flags;
+	uint64_t		dfl_num_exts;
+	int64_t			dfl_offset;
+
+	/*
+	 * N.B. this is only an internal debugging API! This is only called
+	 * from debug builds of sd for pre-release checking. Remove before GA!
+ */ + void (*dfl_ck_func)(uint64_t, uint64_t, void *); + void *dfl_ck_arg; + + dkioc_free_list_ext_t dfl_exts[1]; +} dkioc_free_list_t; + +static inline void dfl_free(dkioc_free_list_t *dfl) { + vmem_free(dfl, DFL_SZ(dfl->dfl_num_exts)); +} + +static inline dkioc_free_list_t *dfl_alloc(uint64_t dfl_num_exts, int flags) { + return vmem_zalloc(DFL_SZ(dfl_num_exts), flags); +} + +#endif /* _SPL_DKIOC_UTIL_H */ diff -Naur spl-0.6.5.7/include/sys/dkio.h spl-0.6.5.7.new/include/sys/dkio.h --- spl-0.6.5.7/include/sys/dkio.h 2015-12-24 01:18:07.000000000 +0100 +++ spl-0.6.5.7.new/include/sys/dkio.h 2016-08-01 16:43:23.435766048 +0200 @@ -25,14 +25,16 @@ #ifndef _SPL_DKIO_H #define _SPL_DKIO_H -struct dk_callback { - void (*dkc_callback)(void *dkc_cookie, int error); - void *dkc_cookie; - int dkc_flag; -}; +#define DFL_SZ(num_exts) \ + (sizeof (dkioc_free_list_t) + (num_exts - 1) * 16) -#define DKIOC (0x04 << 8) -#define DKIOCFLUSHWRITECACHE (DKIOC | 34) -#define DKIOCTRIM (DKIOC | 35) +#define DKIOC (0x04 << 8) +#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ + +/* + * ioctl to free space (e.g. SCSI UNMAP) off a disk. + * Pass a dkioc_free_list_t containing a list of extents to be freed. + */ +#define DKIOCFREE (DKIOC|50) #endif /* _SPL_DKIO_H */ diff -Naur spl-0.6.5.7/include/sys/isa_defs.h spl-0.6.5.7.new/include/sys/isa_defs.h --- spl-0.6.5.7/include/sys/isa_defs.h 2016-03-22 19:59:29.000000000 +0100 +++ spl-0.6.5.7.new/include/sys/isa_defs.h 2016-08-01 16:43:34.280796341 +0200 @@ -44,6 +44,9 @@ #define _LP64 #endif +#define _ALIGNMENT_REQUIRED 1 + + /* i386 arch specific defines */ #elif defined(__i386) || defined(__i386__) @@ -59,6 +62,8 @@ #define _ILP32 #endif +#define _ALIGNMENT_REQUIRED 0 + /* powerpc (ppc64) arch specific defines */ #elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__) @@ -80,6 +85,12 @@ #endif #endif +/* + * Illumos doesn't define _ALIGNMENT_REQUIRED for PPC, so default to 1 + * out of paranoia. + */ +#define _ALIGNMENT_REQUIRED 1 + /* arm arch specific defines */ #elif defined(__arm) || defined(__arm__) || defined(__aarch64__) @@ -107,6 +118,12 @@ #define _BIG_ENDIAN #endif +/* + * Illumos doesn't define _ALIGNMENT_REQUIRED for ARM, so default to 1 + * out of paranoia. + */ +#define _ALIGNMENT_REQUIRED 1 + /* sparc arch specific defines */ #elif defined(__sparc) || defined(__sparc__) @@ -130,6 +147,7 @@ #define _BIG_ENDIAN #define _SUNOS_VTOC_16 +#define _ALIGNMENT_REQUIRED 1 /* s390 arch specific defines */ #elif defined(__s390__) @@ -145,7 +163,40 @@ #define _BIG_ENDIAN -#else /* Currently x86_64, i386, arm, powerpc, s390, and sparc are supported */ +/* + * Illumos doesn't define _ALIGNMENT_REQUIRED for s390, so default to 1 + * out of paranoia. + */ +#define _ALIGNMENT_REQUIRED 1 + +/* MIPS arch specific defines */ +#elif defined(__mips__) + +#if defined(__MIPSEB__) +#define _BIG_ENDIAN +#elif defined(__MIPSEL__) +#define _LITTLE_ENDIAN +#else +#error MIPS no endian specified +#endif + +#ifndef _LP64 +#define _ILP32 +#endif + +#define _SUNOS_VTOC_16 + +/* + * Illumos doesn't define _ALIGNMENT_REQUIRED for MIPS, so default to 1 + * out of paranoia. 
+ */ +#define _ALIGNMENT_REQUIRED 1 + +#else +/* + * Currently supported: + * x86_64, i386, arm, powerpc, s390, sparc, and mips + */ #error "Unsupported ISA type" #endif diff -Naur spl-0.6.5.7/include/sys/kobj.h spl-0.6.5.7.new/include/sys/kobj.h --- spl-0.6.5.7/include/sys/kobj.h 2013-03-22 23:19:11.000000000 +0100 +++ spl-0.6.5.7.new/include/sys/kobj.h 2016-08-01 16:43:32.403791098 +0200 @@ -35,8 +35,8 @@ extern struct _buf *kobj_open_file(const char *name); extern void kobj_close_file(struct _buf *file); -extern int kobj_read_file(struct _buf *file, char *buf, - ssize_t size, offset_t off); +extern int kobj_read_file(struct _buf *file, char *buf, unsigned size, + unsigned off); extern int kobj_get_filesize(struct _buf *file, uint64_t *size); #endif /* SPL_KOBJ_H */ diff -Naur spl-0.6.5.7/include/sys/Makefile.am spl-0.6.5.7.new/include/sys/Makefile.am --- spl-0.6.5.7/include/sys/Makefile.am 2015-12-24 01:18:07.000000000 +0100 +++ spl-0.6.5.7.new/include/sys/Makefile.am 2016-08-01 16:43:23.435766048 +0200 @@ -29,6 +29,7 @@ $(top_srcdir)/include/sys/dirent.h \ $(top_srcdir)/include/sys/disp.h \ $(top_srcdir)/include/sys/dkio.h \ + $(top_srcdir)/include/sys/dkioc_free_util.h \ $(top_srcdir)/include/sys/dklabel.h \ $(top_srcdir)/include/sys/dnlc.h \ $(top_srcdir)/include/sys/dumphdr.h \ diff -Naur spl-0.6.5.7/include/sys/mutex.h spl-0.6.5.7.new/include/sys/mutex.h --- spl-0.6.5.7/include/sys/mutex.h 2015-12-24 01:18:07.000000000 +0100 +++ spl-0.6.5.7.new/include/sys/mutex.h 2016-08-01 16:43:34.275796327 +0200 @@ -28,17 +28,25 @@ #include #include #include +#include typedef enum { MUTEX_DEFAULT = 0, MUTEX_SPIN = 1, - MUTEX_ADAPTIVE = 2 + MUTEX_ADAPTIVE = 2, + MUTEX_NOLOCKDEP = 3 } kmutex_type_t; typedef struct { struct mutex m_mutex; spinlock_t m_lock; /* used for serializing mutex_exit */ +#ifndef HAVE_MUTEX_OWNER + /* only when kernel doesn't have owner */ kthread_t *m_owner; +#endif +#ifdef CONFIG_LOCKDEP + kmutex_type_t m_type; +#endif /* CONFIG_LOCKDEP */ } kmutex_t; #define MUTEX(mp) (&((mp)->m_mutex)) @@ -46,20 +54,56 @@ static inline void spl_mutex_set_owner(kmutex_t *mp) { + /* + * kernel will handle its owner, so we don't need to do anything if it + * is defined. + */ +#ifndef HAVE_MUTEX_OWNER mp->m_owner = current; +#endif } static inline void spl_mutex_clear_owner(kmutex_t *mp) { +#ifndef HAVE_MUTEX_OWNER mp->m_owner = NULL; +#endif } +#ifdef HAVE_MUTEX_OWNER +#define mutex_owner(mp) (ACCESS_ONCE(MUTEX(mp)->owner)) +#else #define mutex_owner(mp) (ACCESS_ONCE((mp)->m_owner)) +#endif #define mutex_owned(mp) (mutex_owner(mp) == current) #define MUTEX_HELD(mp) mutex_owned(mp) #define MUTEX_NOT_HELD(mp) (!MUTEX_HELD(mp)) +#ifdef CONFIG_LOCKDEP +static inline void +spl_mutex_set_type(kmutex_t *mp, kmutex_type_t type) +{ + mp->m_type = type; +} +static inline void +spl_mutex_lockdep_off_maybe(kmutex_t *mp) \ +{ \ + if (mp && mp->m_type == MUTEX_NOLOCKDEP) \ + lockdep_off(); \ +} +static inline void +spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ +{ \ + if (mp && mp->m_type == MUTEX_NOLOCKDEP) \ + lockdep_on(); \ +} +#else /* CONFIG_LOCKDEP */ +#define spl_mutex_set_type(mp, type) +#define spl_mutex_lockdep_off_maybe(mp) +#define spl_mutex_lockdep_on_maybe(mp) +#endif /* CONFIG_LOCKDEP */ + /* * The following functions must be a #define and not static inline. 
* This ensures that the native linux mutex functions (lock/unlock) @@ -70,11 +114,12 @@ #define mutex_init(mp, name, type, ibc) \ { \ static struct lock_class_key __key; \ - ASSERT(type == MUTEX_DEFAULT); \ + ASSERT(type == MUTEX_DEFAULT || type == MUTEX_NOLOCKDEP); \ \ __mutex_init(MUTEX(mp), (name) ? (#name) : (#mp), &__key); \ spin_lock_init(&(mp)->m_lock); \ spl_mutex_clear_owner(mp); \ + spl_mutex_set_type(mp, type); \ } #undef mutex_destroy @@ -87,8 +132,10 @@ ({ \ int _rc_; \ \ + spl_mutex_lockdep_off_maybe(mp); \ if ((_rc_ = mutex_trylock(MUTEX(mp))) == 1) \ spl_mutex_set_owner(mp); \ + spl_mutex_lockdep_on_maybe(mp); \ \ _rc_; \ }) @@ -97,14 +144,18 @@ #define mutex_enter_nested(mp, subclass) \ { \ ASSERT3P(mutex_owner(mp), !=, current); \ + spl_mutex_lockdep_off_maybe(mp); \ mutex_lock_nested(MUTEX(mp), (subclass)); \ + spl_mutex_lockdep_on_maybe(mp); \ spl_mutex_set_owner(mp); \ } #else /* CONFIG_DEBUG_LOCK_ALLOC */ #define mutex_enter_nested(mp, subclass) \ { \ ASSERT3P(mutex_owner(mp), !=, current); \ + spl_mutex_lockdep_off_maybe(mp); \ mutex_lock(MUTEX(mp)); \ + spl_mutex_lockdep_on_maybe(mp); \ spl_mutex_set_owner(mp); \ } #endif /* CONFIG_DEBUG_LOCK_ALLOC */ @@ -132,10 +183,12 @@ */ #define mutex_exit(mp) \ { \ + spl_mutex_lockdep_off_maybe(mp); \ spin_lock(&(mp)->m_lock); \ spl_mutex_clear_owner(mp); \ mutex_unlock(MUTEX(mp)); \ spin_unlock(&(mp)->m_lock); \ + spl_mutex_lockdep_on_maybe(mp); \ } int spl_mutex_init(void); diff -Naur spl-0.6.5.7/include/sys/random.h spl-0.6.5.7.new/include/sys/random.h --- spl-0.6.5.7/include/sys/random.h 2013-03-22 23:19:11.000000000 +0100 +++ spl-0.6.5.7.new/include/sys/random.h 2016-08-01 16:43:33.091793020 +0200 @@ -35,11 +35,6 @@ return 0; } -static __inline__ int -random_get_pseudo_bytes(uint8_t *ptr, size_t len) -{ - get_random_bytes((void *)ptr,(int)len); - return 0; -} +extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); #endif /* _SPL_RANDOM_H */ diff -Naur spl-0.6.5.7/include/sys/rwlock.h spl-0.6.5.7.new/include/sys/rwlock.h --- spl-0.6.5.7/include/sys/rwlock.h 2015-12-24 01:18:07.000000000 +0100 +++ spl-0.6.5.7.new/include/sys/rwlock.h 2016-08-01 16:43:34.279796338 +0200 @@ -30,55 +30,86 @@ #include typedef enum { - RW_DRIVER = 2, - RW_DEFAULT = 4 + RW_DRIVER = 2, + RW_DEFAULT = 4, + RW_NOLOCKDEP = 5 } krw_type_t; typedef enum { - RW_NONE = 0, - RW_WRITER = 1, - RW_READER = 2 + RW_NONE = 0, + RW_WRITER = 1, + RW_READER = 2 } krw_t; +/* + * If CONFIG_RWSEM_SPIN_ON_OWNER is defined, rw_semaphore will have an owner + * field, so we don't need our own. + */ typedef struct { - struct rw_semaphore rw_rwlock; - kthread_t *rw_owner; + struct rw_semaphore rw_rwlock; +#ifndef CONFIG_RWSEM_SPIN_ON_OWNER + kthread_t *rw_owner; +#endif +#ifdef CONFIG_LOCKDEP + krw_type_t rw_type; +#endif /* CONFIG_LOCKDEP */ } krwlock_t; -#define SEM(rwp) ((struct rw_semaphore *)(rwp)) +#define SEM(rwp) (&(rwp)->rw_rwlock) static inline void spl_rw_set_owner(krwlock_t *rwp) { - unsigned long flags; - - spl_rwsem_lock_irqsave(&SEM(rwp)->wait_lock, flags); - rwp->rw_owner = current; - spl_rwsem_unlock_irqrestore(&SEM(rwp)->wait_lock, flags); +/* + * If CONFIG_RWSEM_SPIN_ON_OWNER is defined, down_write, up_write, + * downgrade_write and __init_rwsem will set/clear owner for us. 
+ */ +#ifndef CONFIG_RWSEM_SPIN_ON_OWNER + rwp->rw_owner = current; +#endif } static inline void spl_rw_clear_owner(krwlock_t *rwp) { - unsigned long flags; - - spl_rwsem_lock_irqsave(&SEM(rwp)->wait_lock, flags); - rwp->rw_owner = NULL; - spl_rwsem_unlock_irqrestore(&SEM(rwp)->wait_lock, flags); +#ifndef CONFIG_RWSEM_SPIN_ON_OWNER + rwp->rw_owner = NULL; +#endif } static inline kthread_t * rw_owner(krwlock_t *rwp) { - unsigned long flags; - kthread_t *owner; - - spl_rwsem_lock_irqsave(&SEM(rwp)->wait_lock, flags); - owner = rwp->rw_owner; - spl_rwsem_unlock_irqrestore(&SEM(rwp)->wait_lock, flags); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + return SEM(rwp)->owner; +#else + return rwp->rw_owner; +#endif +} - return owner; +#ifdef CONFIG_LOCKDEP +static inline void +spl_rw_set_type(krwlock_t *rwp, krw_type_t type) +{ + rwp->rw_type = type; } +static inline void +spl_rw_lockdep_off_maybe(krwlock_t *rwp) \ +{ \ + if (rwp && rwp->rw_type == RW_NOLOCKDEP) \ + lockdep_off(); \ +} +static inline void +spl_rw_lockdep_on_maybe(krwlock_t *rwp) \ +{ \ + if (rwp && rwp->rw_type == RW_NOLOCKDEP) \ + lockdep_on(); \ +} +#else /* CONFIG_LOCKDEP */ +#define spl_rw_set_type(rwp, type) +#define spl_rw_lockdep_off_maybe(rwp) +#define spl_rw_lockdep_on_maybe(rwp) +#endif /* CONFIG_LOCKDEP */ static inline int RW_READ_HELD(krwlock_t *rwp) @@ -89,7 +120,7 @@ static inline int RW_WRITE_HELD(krwlock_t *rwp) { - return (spl_rwsem_is_locked(SEM(rwp)) && rw_owner(rwp) == current); + return (rw_owner(rwp) == current); } static inline int @@ -104,107 +135,101 @@ * will be correctly located in the users code which is important * for the built in kernel lock analysis tools */ -#define rw_init(rwp, name, type, arg) \ -({ \ - static struct lock_class_key __key; \ - \ - __init_rwsem(SEM(rwp), #rwp, &__key); \ - spl_rw_clear_owner(rwp); \ -}) - -#define rw_destroy(rwp) \ -({ \ - VERIFY(!RW_LOCK_HELD(rwp)); \ -}) - -#define rw_tryenter(rwp, rw) \ -({ \ - int _rc_ = 0; \ - \ - switch (rw) { \ - case RW_READER: \ - _rc_ = down_read_trylock(SEM(rwp)); \ - break; \ - case RW_WRITER: \ - if ((_rc_ = down_write_trylock(SEM(rwp)))) \ - spl_rw_set_owner(rwp); \ - break; \ - default: \ - VERIFY(0); \ - } \ - _rc_; \ -}) - -#define rw_enter(rwp, rw) \ -({ \ - switch (rw) { \ - case RW_READER: \ - down_read(SEM(rwp)); \ - break; \ - case RW_WRITER: \ - down_write(SEM(rwp)); \ - spl_rw_set_owner(rwp); \ - break; \ - default: \ - VERIFY(0); \ - } \ -}) - -#define rw_exit(rwp) \ -({ \ - if (RW_WRITE_HELD(rwp)) { \ - spl_rw_clear_owner(rwp); \ - up_write(SEM(rwp)); \ - } else { \ - ASSERT(RW_READ_HELD(rwp)); \ - up_read(SEM(rwp)); \ - } \ -}) - -#define rw_downgrade(rwp) \ -({ \ - spl_rw_clear_owner(rwp); \ - downgrade_write(SEM(rwp)); \ +#define rw_init(rwp, name, type, arg) \ +({ \ + static struct lock_class_key __key; \ + ASSERT(type == RW_DEFAULT || type == RW_NOLOCKDEP); \ + \ + __init_rwsem(SEM(rwp), #rwp, &__key); \ + spl_rw_clear_owner(rwp); \ + spl_rw_set_type(rwp, type); \ +}) + +#define rw_destroy(rwp) \ +({ \ + VERIFY(!RW_LOCK_HELD(rwp)); \ +}) + +#define rw_tryenter(rwp, rw) \ +({ \ + int _rc_ = 0; \ + \ + spl_rw_lockdep_off_maybe(rwp); \ + switch (rw) { \ + case RW_READER: \ + _rc_ = down_read_trylock(SEM(rwp)); \ + break; \ + case RW_WRITER: \ + if ((_rc_ = down_write_trylock(SEM(rwp)))) \ + spl_rw_set_owner(rwp); \ + break; \ + default: \ + VERIFY(0); \ + } \ + spl_rw_lockdep_on_maybe(rwp); \ + _rc_; \ +}) + +#define rw_enter(rwp, rw) \ +({ \ + spl_rw_lockdep_off_maybe(rwp); \ + switch (rw) { \ + case RW_READER: \ + 
down_read(SEM(rwp)); \ + break; \ + case RW_WRITER: \ + down_write(SEM(rwp)); \ + spl_rw_set_owner(rwp); \ + break; \ + default: \ + VERIFY(0); \ + } \ + spl_rw_lockdep_on_maybe(rwp); \ +}) + +#define rw_exit(rwp) \ +({ \ + spl_rw_lockdep_off_maybe(rwp); \ + if (RW_WRITE_HELD(rwp)) { \ + spl_rw_clear_owner(rwp); \ + up_write(SEM(rwp)); \ + } else { \ + ASSERT(RW_READ_HELD(rwp)); \ + up_read(SEM(rwp)); \ + } \ + spl_rw_lockdep_on_maybe(rwp); \ +}) + +#define rw_downgrade(rwp) \ +({ \ + spl_rw_lockdep_off_maybe(rwp); \ + spl_rw_clear_owner(rwp); \ + downgrade_write(SEM(rwp)); \ + spl_rw_lockdep_on_maybe(rwp); \ }) -#if defined(CONFIG_RWSEM_GENERIC_SPINLOCK) /* - * For the generic implementations of rw-semaphores the following is - * true. If your semaphore implementation internally represents the - * semaphore state differently then special case handling is required. - * - if activity/count is 0 then there are no active readers or writers - * - if activity/count is +ve then that is the number of active readers - * - if activity/count is -1 then there is one active writer + * This implementation of rw_tryupgrade() behaves slightly differently + * from its counterparts on other platforms. It drops the RW_READER lock + * and then acquires the RW_WRITER lock leaving a small window where no + * lock is held. On other platforms the lock is never released during + * the upgrade process. This is necessary under Linux because the kernel + * does not provide an upgrade function. */ - -extern void __up_read_locked(struct rw_semaphore *); -extern int __down_write_trylock_locked(struct rw_semaphore *); - -#define rw_tryupgrade(rwp) \ -({ \ - unsigned long _flags_; \ - int _rc_ = 0; \ - \ - spl_rwsem_lock_irqsave(&SEM(rwp)->wait_lock, _flags_); \ - if ((list_empty(&SEM(rwp)->wait_list)) && \ - (SEM(rwp)->activity == 1)) { \ - __up_read_locked(SEM(rwp)); \ - VERIFY(_rc_ = __down_write_trylock_locked(SEM(rwp))); \ - (rwp)->rw_owner = current; \ - } \ - spl_rwsem_unlock_irqrestore(&SEM(rwp)->wait_lock, _flags_); \ - _rc_; \ +#define rw_tryupgrade(rwp) \ +({ \ + int _rc_ = 0; \ + \ + if (RW_WRITE_HELD(rwp)) { \ + _rc_ = 1; \ + } else { \ + spl_rw_lockdep_off_maybe(rwp); \ + if ((_rc_ = rwsem_tryupgrade(SEM(rwp)))) \ + spl_rw_set_owner(rwp); \ + spl_rw_lockdep_on_maybe(rwp); \ + } \ + _rc_; \ }) -#else -/* - * rw_tryupgrade() can be implemented correctly but for each supported - * arch we will need a custom implementation. For the x86 implementation - * it looks like a custom cmpxchg() to atomically check and promote the - * rwsem would be safe. For now that's not worth the trouble so in this - * case rw_tryupgrade() has just been disabled. 
- */ -#define rw_tryupgrade(rwp) ({ 0; }) -#endif int spl_rw_init(void); void spl_rw_fini(void); diff -Naur spl-0.6.5.7/include/sys/sunldi.h spl-0.6.5.7.new/include/sys/sunldi.h --- spl-0.6.5.7/include/sys/sunldi.h 2013-03-22 23:19:11.000000000 +0100 +++ spl-0.6.5.7.new/include/sys/sunldi.h 2016-08-01 16:43:33.091793020 +0200 @@ -34,23 +34,4 @@ #define SECTOR_SIZE 512 -typedef struct modlinkage { - int ml_rev; - struct modlfs *ml_modlfs; - struct modldrv *ml_modldrv; - major_t ml_major; - unsigned ml_minors; - void *pad1; -} modlinkage_t; - -typedef struct ldi_ident { - char li_modname[MAXNAMELEN]; - dev_t li_dev; -} *ldi_ident_t; - -typedef struct block_device *ldi_handle_t; - -extern int ldi_ident_from_mod(struct modlinkage *modlp, ldi_ident_t *lip); -extern void ldi_ident_release(ldi_ident_t li); - #endif /* SPL_SUNLDI_H */ diff -Naur spl-0.6.5.7/include/sys/sysmacros.h spl-0.6.5.7.new/include/sys/sysmacros.h --- spl-0.6.5.7/include/sys/sysmacros.h 2015-12-24 01:31:01.000000000 +0100 +++ spl-0.6.5.7.new/include/sys/sysmacros.h 2016-08-01 16:43:34.280796341 +0200 @@ -158,6 +158,9 @@ extern void spl_setup(void); extern void spl_cleanup(void); +#define highbit(x) __fls(x) +#define lowbit(x) __ffs(x) + #define highbit64(x) fls64(x) #define makedevice(maj,min) makedev(maj,min) diff -Naur spl-0.6.5.7/include/sys/taskq.h spl-0.6.5.7.new/include/sys/taskq.h --- spl-0.6.5.7/include/sys/taskq.h 2015-12-24 01:18:07.000000000 +0100 +++ spl-0.6.5.7.new/include/sys/taskq.h 2016-08-01 16:43:33.091793020 +0200 @@ -1,4 +1,4 @@ -/*****************************************************************************\ +/* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -20,10 +20,10 @@ * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . -\*****************************************************************************/ + */ #ifndef _SPL_TASKQ_H -#define _SPL_TASKQ_H +#define _SPL_TASKQ_H #include #include @@ -32,55 +32,67 @@ #include #include #include +#include -#define TASKQ_NAMELEN 31 +#define TASKQ_NAMELEN 31 -#define TASKQ_PREPOPULATE 0x00000001 -#define TASKQ_CPR_SAFE 0x00000002 -#define TASKQ_DYNAMIC 0x00000004 -#define TASKQ_THREADS_CPU_PCT 0x00000008 -#define TASKQ_DC_BATCH 0x00000010 -#define TASKQ_ACTIVE 0x80000000 +#define TASKQ_PREPOPULATE 0x00000001 +#define TASKQ_CPR_SAFE 0x00000002 +#define TASKQ_DYNAMIC 0x00000004 +#define TASKQ_THREADS_CPU_PCT 0x00000008 +#define TASKQ_DC_BATCH 0x00000010 +#define TASKQ_ACTIVE 0x80000000 /* * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as * KM_SLEEP/KM_NOSLEEP. TQ_NOQUEUE/TQ_NOALLOC are set particularly * large so as not to conflict with already used GFP_* defines. 
*/ -#define TQ_SLEEP 0x00000000 -#define TQ_NOSLEEP 0x00000001 -#define TQ_PUSHPAGE 0x00000002 -#define TQ_NOQUEUE 0x01000000 -#define TQ_NOALLOC 0x02000000 -#define TQ_NEW 0x04000000 -#define TQ_FRONT 0x08000000 +#define TQ_SLEEP 0x00000000 +#define TQ_NOSLEEP 0x00000001 +#define TQ_PUSHPAGE 0x00000002 +#define TQ_NOQUEUE 0x01000000 +#define TQ_NOALLOC 0x02000000 +#define TQ_NEW 0x04000000 +#define TQ_FRONT 0x08000000 + +/* + * spin_lock(lock) and spin_lock_nested(lock,0) are equivalent, + * so TQ_LOCK_DYNAMIC must not evaluate to 0 + */ +typedef enum tq_lock_role { + TQ_LOCK_GENERAL = 0, + TQ_LOCK_DYNAMIC = 1, +} tq_lock_role_t; typedef unsigned long taskqid_t; typedef void (task_func_t)(void *); typedef struct taskq { - spinlock_t tq_lock; /* protects taskq_t */ - unsigned long tq_lock_flags; /* interrupt state */ - char *tq_name; /* taskq name */ - struct list_head tq_thread_list;/* list of all threads */ - struct list_head tq_active_list;/* list of active threads */ - int tq_nactive; /* # of active threads */ - int tq_nthreads; /* # of existing threads */ - int tq_nspawn; /* # of threads being spawned */ - int tq_maxthreads; /* # of threads maximum */ - int tq_pri; /* priority */ - int tq_minalloc; /* min task_t pool size */ - int tq_maxalloc; /* max task_t pool size */ - int tq_nalloc; /* cur task_t pool size */ - uint_t tq_flags; /* flags */ - taskqid_t tq_next_id; /* next pend/work id */ - taskqid_t tq_lowest_id; /* lowest pend/work id */ - struct list_head tq_free_list; /* free task_t's */ - struct list_head tq_pend_list; /* pending task_t's */ - struct list_head tq_prio_list; /* priority pending task_t's */ - struct list_head tq_delay_list; /* delayed task_t's */ - wait_queue_head_t tq_work_waitq; /* new work waitq */ - wait_queue_head_t tq_wait_waitq; /* wait waitq */ + spinlock_t tq_lock; /* protects taskq_t */ + char *tq_name; /* taskq name */ + int tq_instance; /* instance of tq_name */ + struct list_head tq_thread_list; /* list of all threads */ + struct list_head tq_active_list; /* list of active threads */ + int tq_nactive; /* # of active threads */ + int tq_nthreads; /* # of existing threads */ + int tq_nspawn; /* # of threads being spawned */ + int tq_maxthreads; /* # of threads maximum */ + int tq_pri; /* priority */ + int tq_minalloc; /* min taskq_ent_t pool size */ + int tq_maxalloc; /* max taskq_ent_t pool size */ + int tq_nalloc; /* cur taskq_ent_t pool size */ + uint_t tq_flags; /* flags */ + taskqid_t tq_next_id; /* next pend/work id */ + taskqid_t tq_lowest_id; /* lowest pend/work id */ + struct list_head tq_free_list; /* free taskq_ent_t's */ + struct list_head tq_pend_list; /* pending taskq_ent_t's */ + struct list_head tq_prio_list; /* priority pending taskq_ent_t's */ + struct list_head tq_delay_list; /* delayed taskq_ent_t's */ + struct list_head tq_taskqs; /* all taskq_t's */ + wait_queue_head_t tq_work_waitq; /* new work waitq */ + wait_queue_head_t tq_wait_waitq; /* wait waitq */ + tq_lock_role_t tq_lock_class; /* class when taking tq_lock */ } taskq_t; typedef struct taskq_ent { @@ -93,10 +105,11 @@ void *tqent_arg; taskq_t *tqent_taskq; uintptr_t tqent_flags; + unsigned long tqent_birth; } taskq_ent_t; -#define TQENT_FLAG_PREALLOC 0x1 -#define TQENT_FLAG_CANCEL 0x2 +#define TQENT_FLAG_PREALLOC 0x1 +#define TQENT_FLAG_CANCEL 0x2 typedef struct taskq_thread { struct list_head tqt_thread_list; @@ -111,6 +124,10 @@ /* Global system-wide dynamic task queue available for all consumers */ extern taskq_t *system_taskq; +/* List of all taskqs */ +extern struct 
list_head tq_list; +extern struct rw_semaphore tq_list_sem; + extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t, clock_t); @@ -124,11 +141,11 @@ extern void taskq_wait_outstanding(taskq_t *, taskqid_t); extern void taskq_wait(taskq_t *); extern int taskq_cancel_id(taskq_t *, taskqid_t); -extern int taskq_member(taskq_t *, void *); +extern int taskq_member(taskq_t *, kthread_t *); -#define taskq_create_proc(name, nthreads, pri, min, max, proc, flags) \ +#define taskq_create_proc(name, nthreads, pri, min, max, proc, flags) \ taskq_create(name, nthreads, pri, min, max, flags) -#define taskq_create_sysdc(name, nthreads, min, max, proc, dc, flags) \ +#define taskq_create_sysdc(name, nthreads, min, max, proc, dc, flags) \ taskq_create(name, nthreads, maxclsyspri, min, max, flags) int spl_taskq_init(void); diff -Naur spl-0.6.5.7/include/sys/time.h spl-0.6.5.7.new/include/sys/time.h --- spl-0.6.5.7/include/sys/time.h 2015-12-24 01:31:01.000000000 +0100 +++ spl-0.6.5.7.new/include/sys/time.h 2016-08-01 16:43:34.276796330 +0200 @@ -46,6 +46,9 @@ #define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) #define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) +#define NSEC2SEC(n) ((n) / (NANOSEC / SEC)) +#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC)) + static const int hz = HZ; #define TIMESPEC_OVERFLOW(ts) \ diff -Naur spl-0.6.5.7/include/sys/tsd.h spl-0.6.5.7.new/include/sys/tsd.h --- spl-0.6.5.7/include/sys/tsd.h 2013-03-22 23:19:11.000000000 +0100 +++ spl-0.6.5.7.new/include/sys/tsd.h 2016-08-01 16:43:31.475788506 +0200 @@ -35,6 +35,7 @@ extern int tsd_set(uint_t, void *); extern void *tsd_get(uint_t); +extern void *tsd_get_by_thread(uint_t, kthread_t *); extern void tsd_create(uint_t *, dtor_func_t); extern void tsd_destroy(uint_t *); extern void tsd_exit(void); diff -Naur spl-0.6.5.7/include/sys/user.h spl-0.6.5.7.new/include/sys/user.h --- spl-0.6.5.7/include/sys/user.h 2015-12-24 01:18:07.000000000 +0100 +++ spl-0.6.5.7.new/include/sys/user.h 2016-08-01 16:43:24.019767680 +0200 @@ -30,8 +30,8 @@ * about the Linux task_struct. Since this is internal to our compatibility * layer, we make it an opaque type. * - * XXX: If the descriptor changes under us, we would get an incorrect - * reference. + * XXX: If the descriptor changes under us and we do not do a getf() between + * the change and using it, we would get an incorrect reference. */ struct uf_info; diff -Naur spl-0.6.5.7/man/man5/spl-module-parameters.5 spl-0.6.5.7.new/man/man5/spl-module-parameters.5 --- spl-0.6.5.7/man/man5/spl-module-parameters.5 2015-12-17 18:46:53.000000000 +0100 +++ spl-0.6.5.7.new/man/man5/spl-module-parameters.5 2016-08-01 16:43:33.091793020 +0200 @@ -44,6 +44,20 @@ .sp .ne 2 .na +\fBspl_kmem_cache_kmem_threads\fR (uint) +.ad +.RS 12n +The number of threads created for the spl_kmem_cache task queue. This task +queue is responsible for allocating new slabs for use by the kmem caches. +For the majority of systems and workloads only a small number of threads are +required. +.sp +Default value: \fB4\fR +.RE + +.sp +.ne 2 +.na \fBspl_kmem_cache_reclaim\fR (uint) .ad .RS 12n @@ -237,6 +251,20 @@ .RE .sp +.ne 2 +.na +\fBspl_taskq_kick\fR (uint) +.ad +.RS 12n +Kick stuck taskq to spawn threads. When writing a non-zero value to it, it will +scan all the taskqs. If any of them have a pending task more than 5 seconds old, +it will kick it to spawn more threads. 
This can be used if you find a rare +deadlock occurs because one or more taskqs didn't spawn a thread when it should. +.sp +Default value: \fB0\fR +.RE + +.sp .ne 2 .na \fBspl_taskq_thread_bind\fR (int) diff -Naur spl-0.6.5.7/module/spl/spl-condvar.c spl-0.6.5.7.new/module/spl/spl-condvar.c --- spl-0.6.5.7/module/spl/spl-condvar.c 2016-05-13 04:46:57.000000000 +0200 +++ spl-0.6.5.7.new/module/spl/spl-condvar.c 2016-08-01 16:43:34.278796336 +0200 @@ -26,6 +26,7 @@ #include #include +#include void __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) @@ -238,7 +239,7 @@ DEFINE_WAIT(wait); kmutex_t *m; hrtime_t time_left, now; - unsigned long time_left_us; + ktime_t ktime_left; ASSERT(cvp); ASSERT(mp); @@ -258,7 +259,6 @@ atomic_dec(&cvp->cv_refs); return (-1); } - time_left_us = time_left / NSEC_PER_USEC; prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); atomic_inc(&cvp->cv_waiters); @@ -273,7 +273,9 @@ * Allow a 100 us range to give kernel an opportunity to coalesce * interrupts */ - usleep_range(time_left_us, time_left_us + 100); + ktime_left = ktime_set(0, time_left); + schedule_hrtimeout_range(&ktime_left, 100 * NSEC_PER_USEC, + HRTIMER_MODE_REL); /* No more waiters a different mutex could be used */ if (atomic_dec_and_test(&cvp->cv_waiters)) { @@ -290,15 +292,15 @@ mutex_enter(mp); time_left = expire_time - gethrtime(); - return (time_left > 0 ? time_left : -1); + return (time_left > 0 ? NSEC_TO_TICK(time_left) : -1); } /* * Compatibility wrapper for the cv_timedwait_hires() Illumos interface. */ -clock_t -cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, - int flag) +static clock_t +cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, + int flag, int state) { if (res > 1) { /* @@ -312,10 +314,27 @@ if (!(flag & CALLOUT_FLAG_ABSOLUTE)) tim += gethrtime(); - return (__cv_timedwait_hires(cvp, mp, tim, TASK_UNINTERRUPTIBLE)); + return (__cv_timedwait_hires(cvp, mp, tim, state)); +} + +clock_t +cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, + int flag) +{ + return (cv_timedwait_hires_common(cvp, mp, tim, res, flag, + TASK_UNINTERRUPTIBLE)); } EXPORT_SYMBOL(cv_timedwait_hires); +clock_t +cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, + int flag) +{ + return (cv_timedwait_hires_common(cvp, mp, tim, res, flag, + TASK_INTERRUPTIBLE)); +} +EXPORT_SYMBOL(cv_timedwait_sig_hires); + void __cv_signal(kcondvar_t *cvp) { diff -Naur spl-0.6.5.7/module/spl/spl-generic.c spl-0.6.5.7.new/module/spl/spl-generic.c --- spl-0.6.5.7/module/spl/spl-generic.c 2016-05-13 04:46:57.000000000 +0200 +++ spl-0.6.5.7.new/module/spl/spl-generic.c 2016-08-01 16:43:33.091793020 +0200 @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #include #include @@ -56,6 +58,112 @@ proc_t p0 = { 0 }; EXPORT_SYMBOL(p0); +/* + * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna + * + * "Further scramblings of Marsaglia's xorshift generators" + * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf + * + * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose + * is to provide bytes containing random numbers. It is mapped to /dev/urandom + * on Illumos, which uses a "FIPS 186-2 algorithm". No user of the SPL's + * random_get_pseudo_bytes() needs bytes that are of cryptographic quality, so + * we can implement it using a fast PRNG that we seed using Linux' actual + * equivalent to random_get_pseudo_bytes(). 
We do this by providing each CPU
+ * with an independent seed so that all calls to random_get_pseudo_bytes() are
+ * free of atomic instructions.
+ *
+ * A consequence of using a fast PRNG is that using random_get_pseudo_bytes()
+ * to generate words larger than 128 bits will paradoxically be limited to
+ * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1`
+ * 128-bit words and selecting the first will implicitly select the second. If
+ * a caller finds this behavior undesirable, random_get_bytes() should be used
+ * instead.
+ *
+ * XXX: Linux interrupt handlers that trigger within the critical section
+ * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will
+ * see the same numbers. Nothing in the code currently calls this in an
+ * interrupt handler, so this is considered to be okay. If that becomes a
+ * problem, we could create a set of per-cpu variables for interrupt handlers
+ * and use them when in_interrupt() from linux/preempt_mask.h evaluates to
+ * true.
+ */
+static DEFINE_PER_CPU(uint64_t[2], spl_pseudo_entropy);
+
+/*
+ * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed
+ * file:
+ *
+ * http://xorshift.di.unimi.it/xorshift128plus.c
+ */
+
+static inline uint64_t
+spl_rand_next(uint64_t *s) {
+        uint64_t s1 = s[0];
+        const uint64_t s0 = s[1];
+        s[0] = s0;
+        s1 ^= s1 << 23; // a
+        s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
+        return (s[1] + s0);
+}
+
+static inline void
+spl_rand_jump(uint64_t *s) {
+        static const uint64_t JUMP[] =
+            { 0x8a5cd789635d2dff, 0x121fd2155c472f96 };
+
+        uint64_t s0 = 0;
+        uint64_t s1 = 0;
+        int i, b;
+        for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++)
+                for (b = 0; b < 64; b++) {
+                        if (JUMP[i] & 1ULL << b) {
+                                s0 ^= s[0];
+                                s1 ^= s[1];
+                        }
+                        (void) spl_rand_next(s);
+                }
+
+        s[0] = s0;
+        s[1] = s1;
+}
+
+int
+random_get_pseudo_bytes(uint8_t *ptr, size_t len)
+{
+        uint64_t *xp, s[2];
+
+        ASSERT(ptr);
+
+        xp = get_cpu_var(spl_pseudo_entropy);
+
+        s[0] = xp[0];
+        s[1] = xp[1];
+
+        while (len) {
+                union {
+                        uint64_t ui64;
+                        uint8_t byte[sizeof (uint64_t)];
+                } entropy;
+                int i = MIN(len, sizeof (uint64_t));
+
+                len -= i;
+                entropy.ui64 = spl_rand_next(s);
+
+                while (i--)
+                        *ptr++ = entropy.byte[i];
+        }
+
+        xp[0] = s[0];
+        xp[1] = s[1];
+
+        put_cpu_var(spl_pseudo_entropy);
+
+        return (0);
+}
+EXPORT_SYMBOL(random_get_pseudo_bytes);
+
 #if BITS_PER_LONG == 32
 /*
  * Support 64/64 => 64 division on a 32-bit platform. While the kernel
@@ -491,29 +599,58 @@
 	rc = spl_kmem_init();
 	if (rc)
-		goto out1;
+		return (rc);
 
 	rc = spl_vmem_init();
-	if (rc)
-		goto out2;
-
-	rc = spl_kmem_cache_init();
-	if (rc)
-		goto out3;
+	if (rc) {
+		spl_kmem_fini();
+		return (rc);
+	}
 
 	return (rc);
-out3:
-	spl_vmem_fini();
-out2:
-	spl_kmem_fini();
-out1:
-	return (rc);
+}
+
+/*
+ * We initialize the random number generator with 128 bits of entropy from the
+ * system random number generator. In the improbable case that we have a zero
+ * seed, we fall back to the system jiffies, unless it is also zero, in which
+ * situation we use a preprogrammed seed. We step forward by 2^64 iterations to
+ * initialize each of the per-cpu seeds so that the sequences generated on each
+ * CPU are guaranteed to never overlap in practice.
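+ *
+ * Illustrative sketch (an editor's note, not upstream text): with a period
+ * of 2^128 - 1, seeding by repeated jumps hands each CPU a disjoint
+ * subsequence of one master seed S:
+ *
+ *   cpu0 seed = jump(S)          S advanced by 2^64 states
+ *   cpu1 seed = jump(jump(S))    S advanced by 2 * 2^64 states
+ *   ...
+ *
+ * so no two CPUs can generate overlapping streams in practice.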
+ */ +static void __init +spl_random_init(void) +{ + uint64_t s[2]; + int i; + + get_random_bytes(s, sizeof (s)); + + if (s[0] == 0 && s[1] == 0) { + if (jiffies != 0) { + s[0] = jiffies; + s[1] = ~0 - jiffies; + } else { + (void) memcpy(s, "improbable seed", sizeof (s)); + } + printk("SPL: get_random_bytes() returned 0 " + "when generating random seed. Setting initial seed to " + "0x%016llx%016llx.", cpu_to_be64(s[0]), cpu_to_be64(s[1])); + } + + for (i = 0; i < NR_CPUS; i++) { + uint64_t *wordp = per_cpu(spl_pseudo_entropy, i); + + spl_rand_jump(s); + + wordp[0] = s[0]; + wordp[1] = s[1]; + } } static void spl_kvmem_fini(void) { - spl_kmem_cache_fini(); spl_vmem_fini(); spl_kmem_fini(); } @@ -523,6 +660,8 @@ { int rc = 0; + spl_random_init(); + if ((rc = spl_kvmem_init())) goto out1; @@ -532,38 +671,43 @@ if ((rc = spl_rw_init())) goto out3; - if ((rc = spl_taskq_init())) + if ((rc = spl_tsd_init())) goto out4; - if ((rc = spl_vn_init())) + if ((rc = spl_taskq_init())) goto out5; - if ((rc = spl_proc_init())) + if ((rc = spl_kmem_cache_init())) goto out6; - if ((rc = spl_kstat_init())) + if ((rc = spl_vn_init())) goto out7; - if ((rc = spl_tsd_init())) + if ((rc = spl_proc_init())) goto out8; - if ((rc = spl_zlib_init())) + if ((rc = spl_kstat_init())) goto out9; + if ((rc = spl_zlib_init())) + goto out10; + printk(KERN_NOTICE "SPL: Loaded module v%s-%s%s\n", SPL_META_VERSION, SPL_META_RELEASE, SPL_DEBUG_STR); return (rc); +out10: + spl_kstat_fini(); out9: - spl_tsd_fini(); + spl_proc_fini(); out8: - spl_kstat_fini(); + spl_vn_fini(); out7: - spl_proc_fini(); + spl_kmem_cache_fini(); out6: - spl_vn_fini(); -out5: spl_taskq_fini(); +out5: + spl_tsd_fini(); out4: spl_rw_fini(); out3: @@ -584,11 +728,12 @@ printk(KERN_NOTICE "SPL: Unloaded module v%s-%s%s\n", SPL_META_VERSION, SPL_META_RELEASE, SPL_DEBUG_STR); spl_zlib_fini(); - spl_tsd_fini(); spl_kstat_fini(); spl_proc_fini(); spl_vn_fini(); + spl_kmem_cache_fini(); spl_taskq_fini(); + spl_tsd_fini(); spl_rw_fini(); spl_mutex_fini(); spl_kvmem_fini(); diff -Naur spl-0.6.5.7/module/spl/spl-kmem-cache.c spl-0.6.5.7.new/module/spl/spl-kmem-cache.c --- spl-0.6.5.7/module/spl/spl-kmem-cache.c 2016-05-13 04:46:56.000000000 +0200 +++ spl-0.6.5.7.new/module/spl/spl-kmem-cache.c 2016-08-01 16:43:34.280796341 +0200 @@ -88,7 +88,7 @@ unsigned int spl_kmem_cache_magazine_size = 0; module_param(spl_kmem_cache_magazine_size, uint, 0444); MODULE_PARM_DESC(spl_kmem_cache_magazine_size, - "Default magazine size (2-256), set automatically (0)\n"); + "Default magazine size (2-256), set automatically (0)"); /* * The default behavior is to report the number of objects remaining in the @@ -1149,15 +1149,13 @@ * It is responsible for allocating a new slab, linking it in to the list * of partial slabs, and then waking any waiters. 
 */
-static void
-spl_cache_grow_work(void *data)
+static int
+__spl_cache_grow(spl_kmem_cache_t *skc, int flags)
 {
-	spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
-	spl_kmem_cache_t *skc = ska->ska_cache;
 	spl_kmem_slab_t *sks;
 
 	fstrans_cookie_t cookie = spl_fstrans_mark();
-	sks = spl_slab_alloc(skc, ska->ska_flags);
+	sks = spl_slab_alloc(skc, flags);
 	spl_fstrans_unmark(cookie);
 
 	spin_lock(&skc->skc_lock);
@@ -1165,15 +1163,29 @@
 		skc->skc_slab_total++;
 		skc->skc_obj_total += sks->sks_objs;
 		list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+
+		smp_mb__before_atomic();
+		clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+		smp_mb__after_atomic();
+
+		wake_up_all(&skc->skc_waitq);
 	}
+	spin_unlock(&skc->skc_lock);
+
+	return (sks == NULL ? -ENOMEM : 0);
+}
+
+static void
+spl_cache_grow_work(void *data)
+{
+	spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
+	spl_kmem_cache_t *skc = ska->ska_cache;
+
+	(void) __spl_cache_grow(skc, ska->ska_flags);
 
 	atomic_dec(&skc->skc_ref);
 	smp_mb__before_atomic();
 	clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
-	clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
 	smp_mb__after_atomic();
-	wake_up_all(&skc->skc_waitq);
-	spin_unlock(&skc->skc_lock);
 
 	kfree(ska);
 }
@@ -1214,6 +1226,21 @@
 	}
 
 	/*
+	 * To reduce context-switch overhead and improve NUMA locality, we
+	 * first try to allocate a new slab in the current process context
+	 * with the KM_NOSLEEP flag. If that fails, the allocation is handed
+	 * off to the task queue below.
+	 *
+	 * However, this can't be applied to KMC_VMEM caches due to a bug
+	 * where __vmalloc() doesn't honor gfp flags in page table allocation.
+	 */
+	if (!(skc->skc_flags & KMC_VMEM)) {
+		rc = __spl_cache_grow(skc, flags | KM_NOSLEEP);
+		if (rc == 0)
+			return (0);
+	}
+
+	/*
 	 * This is handled by dispatching a work request to the global work
 	 * queue. This allows us to asynchronously allocate a new slab while
 	 * retaining the ability to safely fall back to a smaller synchronous
diff -Naur spl-0.6.5.7/module/spl/spl-kmem-cache.c.orig spl-0.6.5.7.new/module/spl/spl-kmem-cache.c.orig
--- spl-0.6.5.7/module/spl/spl-kmem-cache.c.orig	1970-01-01 01:00:00.000000000 +0100
+++ spl-0.6.5.7.new/module/spl/spl-kmem-cache.c.orig	2016-05-13 04:46:56.000000000 +0200
@@ -0,0 +1,1734 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf .
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see .
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see .
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Within the scope of the spl-kmem.c file the kmem_cache_* definitions
+ * are removed to allow access to the real Linux slab allocator.
+ */ +#undef kmem_cache_destroy +#undef kmem_cache_create +#undef kmem_cache_alloc +#undef kmem_cache_free + + +/* + * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() + * with smp_mb__{before,after}_atomic() because they were redundant. This is + * only used inside our SLAB allocator, so we implement an internal wrapper + * here to give us smp_mb__{before,after}_atomic() on older kernels. + */ +#ifndef smp_mb__before_atomic +#define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x) +#endif + +#ifndef smp_mb__after_atomic +#define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x) +#endif + +/* + * Cache expiration was implemented because it was part of the default Solaris + * kmem_cache behavior. The idea is that per-cpu objects which haven't been + * accessed in several seconds should be returned to the cache. On the other + * hand Linux slabs never move objects back to the slabs unless there is + * memory pressure on the system. By default the Linux method is enabled + * because it has been shown to improve responsiveness on low memory systems. + * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM. + */ +unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM; +EXPORT_SYMBOL(spl_kmem_cache_expire); +module_param(spl_kmem_cache_expire, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)"); + +/* + * Cache magazines are an optimization designed to minimize the cost of + * allocating memory. They do this by keeping a per-cpu cache of recently + * freed objects, which can then be reallocated without taking a lock. This + * can improve performance on highly contended caches. However, because + * objects in magazines will prevent otherwise empty slabs from being + * immediately released this may not be ideal for low memory machines. + * + * For this reason spl_kmem_cache_magazine_size can be used to set a maximum + * magazine size. When this value is set to 0 the magazine size will be + * automatically determined based on the object size. Otherwise magazines + * will be limited to 2-256 objects per magazine (i.e per cpu). Magazines + * may never be entirely disabled in this implementation. + */ +unsigned int spl_kmem_cache_magazine_size = 0; +module_param(spl_kmem_cache_magazine_size, uint, 0444); +MODULE_PARM_DESC(spl_kmem_cache_magazine_size, + "Default magazine size (2-256), set automatically (0)\n"); + +/* + * The default behavior is to report the number of objects remaining in the + * cache. This allows the Linux VM to repeatedly reclaim objects from the + * cache when memory is low satisfy other memory allocations. Alternately, + * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache + * is reclaimed. This may increase the likelihood of out of memory events. 
+ */
+unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
+module_param(spl_kmem_cache_reclaim, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
+
+unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
+module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
+
+unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
+module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
+	"Minimal number of objects per slab");
+
+unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
+module_param(spl_kmem_cache_max_size, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
+
+/*
+ * For small objects the Linux slab allocator should be used to make the most
+ * efficient use of the memory. However, large objects are not supported by
+ * the Linux slab and therefore the SPL implementation is preferred. A cutoff
+ * of 16K was determined to be optimal for architectures using 4K pages.
+ */
+#if PAGE_SIZE == 4096
+unsigned int spl_kmem_cache_slab_limit = 16384;
+#else
+unsigned int spl_kmem_cache_slab_limit = 0;
+#endif
+module_param(spl_kmem_cache_slab_limit, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
+	"Objects less than N bytes use the Linux slab");
+
+/*
+ * This value defaults to a threshold designed to avoid allocations which
+ * have been deemed costly by the kernel.
+ */
+unsigned int spl_kmem_cache_kmem_limit =
+	((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) /
+	SPL_KMEM_CACHE_OBJ_PER_SLAB;
+module_param(spl_kmem_cache_kmem_limit, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
+	"Objects less than N bytes use kmalloc");
+
+/*
+ * The number of threads available to allocate new slabs for caches. This
+ * should not need to be tuned but it is available for performance analysis.
+ */
+unsigned int spl_kmem_cache_kmem_threads = 4;
+module_param(spl_kmem_cache_kmem_threads, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
+	"Number of spl_kmem_cache threads");
+
+/*
+ * Slab allocation interfaces
+ *
+ * While the Linux slab implementation was inspired by the Solaris
+ * implementation I cannot use it to emulate the Solaris APIs. I
+ * require two features which are not provided by the Linux slab.
+ *
+ * 1) Constructors AND destructors. Recent versions of the Linux
+ *    kernel have removed support for destructors. This is a deal
+ *    breaker for the SPL which contains particularly expensive
+ *    initializers for mutexes, condition variables, etc. We also
+ *    require a minimal level of cleanup for these data types which,
+ *    unlike many Linux data types, do need to be explicitly destroyed.
+ *
+ * 2) Virtual address space backed slab. Callers of the Solaris slab
+ *    expect it to work well for both small and very large allocations.
+ *    Because of memory fragmentation the Linux slab which is backed
+ *    by kmalloc'ed memory performs very badly when confronted with
+ *    large numbers of large allocations. Basing the slab on the
+ *    virtual address space removes the need for contiguous pages
+ *    and greatly improves performance for large allocations.
+ *
+ * For these reasons, the SPL has its own slab implementation with
+ * the needed features.
It is not as highly optimized as either the + * Solaris or Linux slabs, but it should get me most of what is + * needed until it can be optimized or obsoleted by another approach. + * + * One serious concern I do have about this method is the relatively + * small virtual address space on 32bit arches. This will seriously + * constrain the size of the slab caches and their performance. + */ + +struct list_head spl_kmem_cache_list; /* List of caches */ +struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ +taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */ + +static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); + +SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker); +SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker, + spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS); + +static void * +kv_alloc(spl_kmem_cache_t *skc, int size, int flags) +{ + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + if (skc->skc_flags & KMC_KMEM) { + ASSERT(ISP2(size)); + ptr = (void *)__get_free_pages(lflags, get_order(size)); + } else { + ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL); + } + + /* Resulting allocated memory will be page aligned */ + ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); + + return (ptr); +} + +static void +kv_free(spl_kmem_cache_t *skc, void *ptr, int size) +{ + ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); + + /* + * The Linux direct reclaim path uses this out of band value to + * determine if forward progress is being made. Normally this is + * incremented by kmem_freepages() which is part of the various + * Linux slab implementations. However, since we are using none + * of that infrastructure we are responsible for incrementing it. + */ + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; + + if (skc->skc_flags & KMC_KMEM) { + ASSERT(ISP2(size)); + free_pages((unsigned long)ptr, get_order(size)); + } else { + vfree(ptr); + } +} + +/* + * Required space for each aligned sks. + */ +static inline uint32_t +spl_sks_size(spl_kmem_cache_t *skc) +{ + return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t), + skc->skc_obj_align, uint32_t)); +} + +/* + * Required space for each aligned object. + */ +static inline uint32_t +spl_obj_size(spl_kmem_cache_t *skc) +{ + uint32_t align = skc->skc_obj_align; + + return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + + P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t)); +} + +/* + * Lookup the spl_kmem_object_t for an object given that object. + */ +static inline spl_kmem_obj_t * +spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) +{ + return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size, + skc->skc_obj_align, uint32_t)); +} + +/* + * Required space for each offslab object taking in to account alignment + * restrictions and the power-of-two requirement of kv_alloc(). + */ +static inline uint32_t +spl_offslab_size(spl_kmem_cache_t *skc) +{ + return (1UL << (fls64(spl_obj_size(skc)) + 1)); +} + +/* + * It's important that we pack the spl_kmem_obj_t structure and the + * actual objects in to one large address space to minimize the number + * of calls to the allocator. It is far better to do a few large + * allocations and then subdivide it ourselves. Now which allocator + * we use requires balancing a few trade offs. + * + * For small objects we use kmem_alloc() because as long as you are + * only requesting a small number of pages (ideally just one) its cheap. 
+ * However, when you start requesting multiple pages with kmem_alloc() + * it gets increasingly expensive since it requires contiguous pages. + * For this reason we shift to vmem_alloc() for slabs of large objects + * which removes the need for contiguous pages. We do not use + * vmem_alloc() in all cases because there is significant locking + * overhead in __get_vm_area_node(). This function takes a single + * global lock when acquiring an available virtual address range which + * serializes all vmem_alloc()'s for all slab caches. Using slightly + * different allocation functions for small and large objects should + * give us the best of both worlds. + * + * KMC_ONSLAB KMC_OFFSLAB + * + * +------------------------+ +-----------------+ + * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+ + * | skc_obj_size <-+ | | +-----------------+ | | + * | spl_kmem_obj_t | | | | + * | skc_obj_size <---+ | +-----------------+ | | + * | spl_kmem_obj_t | | | skc_obj_size | <-+ | + * | ... v | | spl_kmem_obj_t | | + * +------------------------+ +-----------------+ v + */ +static spl_kmem_slab_t * +spl_slab_alloc(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_slab_t *sks; + spl_kmem_obj_t *sko, *n; + void *base, *obj; + uint32_t obj_size, offslab_size = 0; + int i, rc = 0; + + base = kv_alloc(skc, skc->skc_slab_size, flags); + if (base == NULL) + return (NULL); + + sks = (spl_kmem_slab_t *)base; + sks->sks_magic = SKS_MAGIC; + sks->sks_objs = skc->skc_slab_objs; + sks->sks_age = jiffies; + sks->sks_cache = skc; + INIT_LIST_HEAD(&sks->sks_list); + INIT_LIST_HEAD(&sks->sks_free_list); + sks->sks_ref = 0; + obj_size = spl_obj_size(skc); + + if (skc->skc_flags & KMC_OFFSLAB) + offslab_size = spl_offslab_size(skc); + + for (i = 0; i < sks->sks_objs; i++) { + if (skc->skc_flags & KMC_OFFSLAB) { + obj = kv_alloc(skc, offslab_size, flags); + if (!obj) { + rc = -ENOMEM; + goto out; + } + } else { + obj = base + spl_sks_size(skc) + (i * obj_size); + } + + ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); + sko = spl_sko_from_obj(skc, obj); + sko->sko_addr = obj; + sko->sko_magic = SKO_MAGIC; + sko->sko_slab = sks; + INIT_LIST_HEAD(&sko->sko_list); + list_add_tail(&sko->sko_list, &sks->sks_free_list); + } + +out: + if (rc) { + if (skc->skc_flags & KMC_OFFSLAB) + list_for_each_entry_safe(sko, + n, &sks->sks_free_list, sko_list) + kv_free(skc, sko->sko_addr, offslab_size); + + kv_free(skc, base, skc->skc_slab_size); + sks = NULL; + } + + return (sks); +} + +/* + * Remove a slab from complete or partial list, it must be called with + * the 'skc->skc_lock' held but the actual free must be performed + * outside the lock to prevent deadlocking on vmem addresses. + */ +static void +spl_slab_free(spl_kmem_slab_t *sks, + struct list_head *sks_list, struct list_head *sko_list) +{ + spl_kmem_cache_t *skc; + + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_ref == 0); + + skc = sks->sks_cache; + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(spin_is_locked(&skc->skc_lock)); + + /* + * Update slab/objects counters in the cache, then remove the + * slab from the skc->skc_partial_list. Finally add the slab + * and all its objects in to the private work lists where the + * destructors will be called and the memory freed to the system. + */ + skc->skc_obj_total -= sks->sks_objs; + skc->skc_slab_total--; + list_del(&sks->sks_list); + list_add(&sks->sks_list, sks_list); + list_splice_init(&sks->sks_free_list, sko_list); +} + +/* + * Reclaim empty slabs at the end of the partial list. 
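+ *
+ * Editor's note (an illustration inferred from this file, not upstream
+ * text): spl_cache_shrink() keeps this list quasi-sorted, fuller slabs at
+ * the head and empty ones at the tail, so the reverse scan below can stop
+ * at the first slab that still holds referenced objects.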
+ */ +static void +spl_slab_reclaim(spl_kmem_cache_t *skc) +{ + spl_kmem_slab_t *sks, *m; + spl_kmem_obj_t *sko, *n; + LIST_HEAD(sks_list); + LIST_HEAD(sko_list); + uint32_t size = 0; + + /* + * Empty slabs and objects must be moved to a private list so they + * can be safely freed outside the spin lock. All empty slabs are + * at the end of skc->skc_partial_list, therefore once a non-empty + * slab is found we can stop scanning. + */ + spin_lock(&skc->skc_lock); + list_for_each_entry_safe_reverse(sks, m, + &skc->skc_partial_list, sks_list) { + + if (sks->sks_ref > 0) + break; + + spl_slab_free(sks, &sks_list, &sko_list); + } + spin_unlock(&skc->skc_lock); + + /* + * The following two loops ensure all the object destructors are + * run, any offslab objects are freed, and the slabs themselves + * are freed. This is all done outside the skc->skc_lock since + * this allows the destructor to sleep, and allows us to perform + * a conditional reschedule when a freeing a large number of + * objects and slabs back to the system. + */ + if (skc->skc_flags & KMC_OFFSLAB) + size = spl_offslab_size(skc); + + list_for_each_entry_safe(sko, n, &sko_list, sko_list) { + ASSERT(sko->sko_magic == SKO_MAGIC); + + if (skc->skc_flags & KMC_OFFSLAB) + kv_free(skc, sko->sko_addr, size); + } + + list_for_each_entry_safe(sks, m, &sks_list, sks_list) { + ASSERT(sks->sks_magic == SKS_MAGIC); + kv_free(skc, sks, skc->skc_slab_size); + } +} + +static spl_kmem_emergency_t * +spl_emergency_search(struct rb_root *root, void *obj) +{ + struct rb_node *node = root->rb_node; + spl_kmem_emergency_t *ske; + unsigned long address = (unsigned long)obj; + + while (node) { + ske = container_of(node, spl_kmem_emergency_t, ske_node); + + if (address < ske->ske_obj) + node = node->rb_left; + else if (address > ske->ske_obj) + node = node->rb_right; + else + return (ske); + } + + return (NULL); +} + +static int +spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + spl_kmem_emergency_t *ske_tmp; + unsigned long address = ske->ske_obj; + + while (*new) { + ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); + + parent = *new; + if (address < ske_tmp->ske_obj) + new = &((*new)->rb_left); + else if (address > ske_tmp->ske_obj) + new = &((*new)->rb_right); + else + return (0); + } + + rb_link_node(&ske->ske_node, parent, new); + rb_insert_color(&ske->ske_node, root); + + return (1); +} + +/* + * Allocate a single emergency object and track it in a red black tree. 
+ */ +static int +spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) +{ + gfp_t lflags = kmem_flags_convert(flags); + spl_kmem_emergency_t *ske; + int order = get_order(skc->skc_obj_size); + int empty; + + /* Last chance use a partial slab if one now exists */ + spin_lock(&skc->skc_lock); + empty = list_empty(&skc->skc_partial_list); + spin_unlock(&skc->skc_lock); + if (!empty) + return (-EEXIST); + + ske = kmalloc(sizeof (*ske), lflags); + if (ske == NULL) + return (-ENOMEM); + + ske->ske_obj = __get_free_pages(lflags, order); + if (ske->ske_obj == 0) { + kfree(ske); + return (-ENOMEM); + } + + spin_lock(&skc->skc_lock); + empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); + if (likely(empty)) { + skc->skc_obj_total++; + skc->skc_obj_emergency++; + if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) + skc->skc_obj_emergency_max = skc->skc_obj_emergency; + } + spin_unlock(&skc->skc_lock); + + if (unlikely(!empty)) { + free_pages(ske->ske_obj, order); + kfree(ske); + return (-EINVAL); + } + + *obj = (void *)ske->ske_obj; + + return (0); +} + +/* + * Locate the passed object in the red black tree and free it. + */ +static int +spl_emergency_free(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_emergency_t *ske; + int order = get_order(skc->skc_obj_size); + + spin_lock(&skc->skc_lock); + ske = spl_emergency_search(&skc->skc_emergency_tree, obj); + if (ske) { + rb_erase(&ske->ske_node, &skc->skc_emergency_tree); + skc->skc_obj_emergency--; + skc->skc_obj_total--; + } + spin_unlock(&skc->skc_lock); + + if (ske == NULL) + return (-ENOENT); + + free_pages(ske->ske_obj, order); + kfree(ske); + + return (0); +} + +/* + * Release objects from the per-cpu magazine back to their slab. The flush + * argument contains the max number of entries to remove from the magazine. + */ +static void +__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) +{ + int i, count = MIN(flush, skm->skm_avail); + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skm->skm_magic == SKM_MAGIC); + ASSERT(spin_is_locked(&skc->skc_lock)); + + for (i = 0; i < count; i++) + spl_cache_shrink(skc, skm->skm_objs[i]); + + skm->skm_avail -= count; + memmove(skm->skm_objs, &(skm->skm_objs[count]), + sizeof (void *) * skm->skm_avail); +} + +static void +spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) +{ + spin_lock(&skc->skc_lock); + __spl_cache_flush(skc, skm, flush); + spin_unlock(&skc->skc_lock); +} + +static void +spl_magazine_age(void *data) +{ + spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; + spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; + + ASSERT(skm->skm_magic == SKM_MAGIC); + ASSERT(skm->skm_cpu == smp_processor_id()); + ASSERT(irqs_disabled()); + + /* There are no available objects or they are too young to age out */ + if ((skm->skm_avail == 0) || + time_before(jiffies, skm->skm_age + skc->skc_delay * HZ)) + return; + + /* + * Because we're executing in interrupt context we may have + * interrupted the holder of this lock. To avoid a potential + * deadlock return if the lock is contended. + */ + if (!spin_trylock(&skc->skc_lock)) + return; + + __spl_cache_flush(skc, skm, skm->skm_refill); + spin_unlock(&skc->skc_lock); +} + +/* + * Called regularly to keep a downward pressure on the cache. + * + * Objects older than skc->skc_delay seconds in the per-cpu magazines will + * be returned to the caches. This is done to prevent idle magazines from + * holding memory which could be better used elsewhere. 
The delay is + * present to prevent thrashing the magazine. + * + * The newly released objects may result in empty partial slabs. Those + * slabs should be released to the system. Otherwise moving the objects + * out of the magazines is just wasted work. + */ +static void +spl_cache_age(void *data) +{ + spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; + taskqid_t id = 0; + + ASSERT(skc->skc_magic == SKC_MAGIC); + + /* Dynamically disabled at run time */ + if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE)) + return; + + atomic_inc(&skc->skc_ref); + + if (!(skc->skc_flags & KMC_NOMAGAZINE)) + on_each_cpu(spl_magazine_age, skc, 1); + + spl_slab_reclaim(skc); + + while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) { + id = taskq_dispatch_delay( + spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP, + ddi_get_lbolt() + skc->skc_delay / 3 * HZ); + + /* Destroy issued after dispatch immediately cancel it */ + if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id) + taskq_cancel_id(spl_kmem_cache_taskq, id); + } + + spin_lock(&skc->skc_lock); + skc->skc_taskqid = id; + spin_unlock(&skc->skc_lock); + + atomic_dec(&skc->skc_ref); +} + +/* + * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. + * When on-slab we want to target spl_kmem_cache_obj_per_slab. However, + * for very small objects we may end up with more than this so as not + * to waste space in the minimal allocation of a single page. Also for + * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min, + * lower than this and we will fail. + */ +static int +spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) +{ + uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs; + + if (skc->skc_flags & KMC_OFFSLAB) { + tgt_objs = spl_kmem_cache_obj_per_slab; + tgt_size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE); + + if ((skc->skc_flags & KMC_KMEM) && + (spl_obj_size(skc) > (SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE))) + return (-ENOSPC); + } else { + sks_size = spl_sks_size(skc); + obj_size = spl_obj_size(skc); + max_size = (spl_kmem_cache_max_size * 1024 * 1024); + tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size); + + /* + * KMC_KMEM slabs are allocated by __get_free_pages() which + * rounds up to the nearest order. Knowing this the size + * should be rounded up to the next power of two with a hard + * maximum defined by the maximum allowed allocation order. + */ + if (skc->skc_flags & KMC_KMEM) { + max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE; + tgt_size = MIN(max_size, + PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1))); + } + + if (tgt_size <= max_size) { + tgt_objs = (tgt_size - sks_size) / obj_size; + } else { + tgt_objs = (max_size - sks_size) / obj_size; + tgt_size = (tgt_objs * obj_size) + sks_size; + } + } + + if (tgt_objs == 0) + return (-ENOSPC); + + *objs = tgt_objs; + *size = tgt_size; + + return (0); +} + +/* + * Make a guess at reasonable per-cpu magazine size based on the size of + * each object and the cost of caching N of them in each magazine. Long + * term this should really adapt based on an observed usage heuristic. 
+ */
+static int
+spl_magazine_size(spl_kmem_cache_t *skc)
+{
+	uint32_t obj_size = spl_obj_size(skc);
+	int size;
+
+	if (spl_kmem_cache_magazine_size > 0)
+		return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2));
+
+	/* Per-magazine sizes below assume a 4 KiB page size */
+	if (obj_size > (PAGE_SIZE * 256))
+		size = 4;	/* Minimum 4 MiB per-magazine */
+	else if (obj_size > (PAGE_SIZE * 32))
+		size = 16;	/* Minimum 2 MiB per-magazine */
+	else if (obj_size > (PAGE_SIZE))
+		size = 64;	/* Minimum 256 KiB per-magazine */
+	else if (obj_size > (PAGE_SIZE / 4))
+		size = 128;	/* Minimum 128 KiB per-magazine */
+	else
+		size = 256;
+
+	return (size);
+}
+
+/*
+ * Allocate a per-cpu magazine to associate with a specific core.
+ */
+static spl_kmem_magazine_t *
+spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
+{
+	spl_kmem_magazine_t *skm;
+	int size = sizeof (spl_kmem_magazine_t) +
+	    sizeof (void *) * skc->skc_mag_size;
+
+	skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+	if (skm) {
+		skm->skm_magic = SKM_MAGIC;
+		skm->skm_avail = 0;
+		skm->skm_size = skc->skc_mag_size;
+		skm->skm_refill = skc->skc_mag_refill;
+		skm->skm_cache = skc;
+		skm->skm_age = jiffies;
+		skm->skm_cpu = cpu;
+	}
+
+	return (skm);
+}
+
+/*
+ * Free a per-cpu magazine associated with a specific core.
+ */
+static void
+spl_magazine_free(spl_kmem_magazine_t *skm)
+{
+	ASSERT(skm->skm_magic == SKM_MAGIC);
+	ASSERT(skm->skm_avail == 0);
+	kfree(skm);
+}
+
+/*
+ * Create all per-cpu magazines of reasonable sizes.
+ */
+static int
+spl_magazine_create(spl_kmem_cache_t *skc)
+{
+	int i;
+
+	if (skc->skc_flags & KMC_NOMAGAZINE)
+		return (0);
+
+	skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
+	    num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
+	skc->skc_mag_size = spl_magazine_size(skc);
+	skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
+
+	for_each_possible_cpu(i) {
+		skc->skc_mag[i] = spl_magazine_alloc(skc, i);
+		if (!skc->skc_mag[i]) {
+			for (i--; i >= 0; i--)
+				spl_magazine_free(skc->skc_mag[i]);
+
+			kfree(skc->skc_mag);
+			return (-ENOMEM);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Destroy all per-cpu magazines.
+ */ +static void +spl_magazine_destroy(spl_kmem_cache_t *skc) +{ + spl_kmem_magazine_t *skm; + int i; + + if (skc->skc_flags & KMC_NOMAGAZINE) + return; + + for_each_possible_cpu(i) { + skm = skc->skc_mag[i]; + spl_cache_flush(skc, skm, skm->skm_avail); + spl_magazine_free(skm); + } + + kfree(skc->skc_mag); +} + +/* + * Create a object cache based on the following arguments: + * name cache name + * size cache object size + * align cache object alignment + * ctor cache object constructor + * dtor cache object destructor + * reclaim cache object reclaim + * priv cache private data for ctor/dtor/reclaim + * vmp unused must be NULL + * flags + * KMC_NOTOUCH Disable cache object aging (unsupported) + * KMC_NODEBUG Disable debugging (unsupported) + * KMC_NOHASH Disable hashing (unsupported) + * KMC_QCACHE Disable qcache (unsupported) + * KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab + * KMC_KMEM Force kmem backed cache + * KMC_VMEM Force vmem backed cache + * KMC_SLAB Force Linux slab backed cache + * KMC_OFFSLAB Locate objects off the slab + */ +spl_kmem_cache_t * +spl_kmem_cache_create(char *name, size_t size, size_t align, + spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim, + void *priv, void *vmp, int flags) +{ + gfp_t lflags = kmem_flags_convert(KM_SLEEP); + spl_kmem_cache_t *skc; + int rc; + + /* + * Unsupported flags + */ + ASSERT0(flags & KMC_NOMAGAZINE); + ASSERT0(flags & KMC_NOHASH); + ASSERT0(flags & KMC_QCACHE); + ASSERT(vmp == NULL); + + might_sleep(); + + skc = kzalloc(sizeof (*skc), lflags); + if (skc == NULL) + return (NULL); + + skc->skc_magic = SKC_MAGIC; + skc->skc_name_size = strlen(name) + 1; + skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags); + if (skc->skc_name == NULL) { + kfree(skc); + return (NULL); + } + strncpy(skc->skc_name, name, skc->skc_name_size); + + skc->skc_ctor = ctor; + skc->skc_dtor = dtor; + skc->skc_reclaim = reclaim; + skc->skc_private = priv; + skc->skc_vmp = vmp; + skc->skc_linux_cache = NULL; + skc->skc_flags = flags; + skc->skc_obj_size = size; + skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; + skc->skc_delay = SPL_KMEM_CACHE_DELAY; + skc->skc_reap = SPL_KMEM_CACHE_REAP; + atomic_set(&skc->skc_ref, 0); + + INIT_LIST_HEAD(&skc->skc_list); + INIT_LIST_HEAD(&skc->skc_complete_list); + INIT_LIST_HEAD(&skc->skc_partial_list); + skc->skc_emergency_tree = RB_ROOT; + spin_lock_init(&skc->skc_lock); + init_waitqueue_head(&skc->skc_waitq); + skc->skc_slab_fail = 0; + skc->skc_slab_create = 0; + skc->skc_slab_destroy = 0; + skc->skc_slab_total = 0; + skc->skc_slab_alloc = 0; + skc->skc_slab_max = 0; + skc->skc_obj_total = 0; + skc->skc_obj_alloc = 0; + skc->skc_obj_max = 0; + skc->skc_obj_deadlock = 0; + skc->skc_obj_emergency = 0; + skc->skc_obj_emergency_max = 0; + + /* + * Verify the requested alignment restriction is sane. + */ + if (align) { + VERIFY(ISP2(align)); + VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); + VERIFY3U(align, <=, PAGE_SIZE); + skc->skc_obj_align = align; + } + + /* + * When no specific type of slab is requested (kmem, vmem, or + * linuxslab) then select a cache type based on the object size + * and default tunables. + */ + if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) { + + /* + * Objects smaller than spl_kmem_cache_slab_limit can + * use the Linux slab for better space-efficiency. By + * default this functionality is disabled until its + * performance characteristics are fully understood. 
+		 */
+		if (spl_kmem_cache_slab_limit &&
+		    size <= (size_t)spl_kmem_cache_slab_limit)
+			skc->skc_flags |= KMC_SLAB;
+
+		/*
+		 * Small objects, less than spl_kmem_cache_kmem_limit per
+		 * object, should use kmem because their slabs are small.
+		 */
+		else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
+			skc->skc_flags |= KMC_KMEM;
+
+		/*
+		 * All other objects are considered large and are placed
+		 * on vmem backed slabs.
+		 */
+		else
+			skc->skc_flags |= KMC_VMEM;
+	}
+
+	/*
+	 * Given the type of slab, allocate the required resources.
+	 */
+	if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
+		rc = spl_slab_size(skc,
+		    &skc->skc_slab_objs, &skc->skc_slab_size);
+		if (rc)
+			goto out;
+
+		rc = spl_magazine_create(skc);
+		if (rc)
+			goto out;
+	} else {
+		unsigned long slabflags = 0;
+
+		if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) {
+			rc = EINVAL;
+			goto out;
+		}
+
+#if defined(SLAB_USERCOPY)
+		/*
+		 * Required for PAX-enabled kernels if the slab is to be
+		 * used for copying between user and kernel space.
+		 */
+		slabflags |= SLAB_USERCOPY;
+#endif
+
+		skc->skc_linux_cache = kmem_cache_create(
+		    skc->skc_name, size, align, slabflags, NULL);
+		if (skc->skc_linux_cache == NULL) {
+			rc = ENOMEM;
+			goto out;
+		}
+
+#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
+		skc->skc_linux_cache->allocflags |= __GFP_COMP;
+#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
+		skc->skc_linux_cache->gfpflags |= __GFP_COMP;
+#endif
+		skc->skc_flags |= KMC_NOMAGAZINE;
+	}
+
+	if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
+		skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
+		    spl_cache_age, skc, TQ_SLEEP,
+		    ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
+
+	down_write(&spl_kmem_cache_sem);
+	list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
+	up_write(&spl_kmem_cache_sem);
+
+	return (skc);
+out:
+	kfree(skc->skc_name);
+	kfree(skc);
+	return (NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_create);
+
+/*
+ * Register a move callback for cache defragmentation.
+ * XXX: Unimplemented but harmless to stub out for now.
+ */
+void
+spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
+    kmem_cbrc_t (move)(void *, void *, size_t, void *))
+{
+	ASSERT(move != NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_set_move);
+
+/*
+ * Destroy a cache and all objects associated with the cache.
+ */
+void
+spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
+{
+	DECLARE_WAIT_QUEUE_HEAD(wq);
+	taskqid_t id;
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
+
+	down_write(&spl_kmem_cache_sem);
+	list_del_init(&skc->skc_list);
+	up_write(&spl_kmem_cache_sem);
+
+	/* Cancel and wait for any pending delayed tasks */
+	VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+	spin_lock(&skc->skc_lock);
+	id = skc->skc_taskqid;
+	spin_unlock(&skc->skc_lock);
+
+	taskq_cancel_id(spl_kmem_cache_taskq, id);
+
+	/*
+	 * Wait until all current callers complete; this is mainly
+	 * to catch the case where a low memory situation triggers a
+	 * cache reaping action which races with this destroy.
+	 */
+	wait_event(wq, atomic_read(&skc->skc_ref) == 0);
+
+	if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
+		spl_magazine_destroy(skc);
+		spl_slab_reclaim(skc);
+	} else {
+		ASSERT(skc->skc_flags & KMC_SLAB);
+		kmem_cache_destroy(skc->skc_linux_cache);
+	}
+
+	spin_lock(&skc->skc_lock);
+
+	/*
+	 * Validate there are no objects in use and free all the
+	 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
+ */ + ASSERT3U(skc->skc_slab_alloc, ==, 0); + ASSERT3U(skc->skc_obj_alloc, ==, 0); + ASSERT3U(skc->skc_slab_total, ==, 0); + ASSERT3U(skc->skc_obj_total, ==, 0); + ASSERT3U(skc->skc_obj_emergency, ==, 0); + ASSERT(list_empty(&skc->skc_complete_list)); + + spin_unlock(&skc->skc_lock); + + kfree(skc->skc_name); + kfree(skc); +} +EXPORT_SYMBOL(spl_kmem_cache_destroy); + +/* + * Allocate an object from a slab attached to the cache. This is used to + * repopulate the per-cpu magazine caches in batches when they run low. + */ +static void * +spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) +{ + spl_kmem_obj_t *sko; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(spin_is_locked(&skc->skc_lock)); + + sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); + ASSERT(sko->sko_magic == SKO_MAGIC); + ASSERT(sko->sko_addr != NULL); + + /* Remove from sks_free_list */ + list_del_init(&sko->sko_list); + + sks->sks_age = jiffies; + sks->sks_ref++; + skc->skc_obj_alloc++; + + /* Track max obj usage statistics */ + if (skc->skc_obj_alloc > skc->skc_obj_max) + skc->skc_obj_max = skc->skc_obj_alloc; + + /* Track max slab usage statistics */ + if (sks->sks_ref == 1) { + skc->skc_slab_alloc++; + + if (skc->skc_slab_alloc > skc->skc_slab_max) + skc->skc_slab_max = skc->skc_slab_alloc; + } + + return (sko->sko_addr); +} + +/* + * Generic slab allocation function to run by the global work queues. + * It is responsible for allocating a new slab, linking it in to the list + * of partial slabs, and then waking any waiters. + */ +static void +spl_cache_grow_work(void *data) +{ + spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; + spl_kmem_cache_t *skc = ska->ska_cache; + spl_kmem_slab_t *sks; + + fstrans_cookie_t cookie = spl_fstrans_mark(); + sks = spl_slab_alloc(skc, ska->ska_flags); + spl_fstrans_unmark(cookie); + + spin_lock(&skc->skc_lock); + if (sks) { + skc->skc_slab_total++; + skc->skc_obj_total += sks->sks_objs; + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + } + + atomic_dec(&skc->skc_ref); + smp_mb__before_atomic(); + clear_bit(KMC_BIT_GROWING, &skc->skc_flags); + clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_all(&skc->skc_waitq); + spin_unlock(&skc->skc_lock); + + kfree(ska); +} + +/* + * Returns non-zero when a new slab should be available. + */ +static int +spl_cache_grow_wait(spl_kmem_cache_t *skc) +{ + return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags)); +} + +/* + * No available objects on any slabs, create a new slab. Note that this + * functionality is disabled for KMC_SLAB caches which are backed by the + * Linux slab. + */ +static int +spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) +{ + int remaining, rc = 0; + + ASSERT0(flags & ~KM_PUBLIC_MASK); + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT((skc->skc_flags & KMC_SLAB) == 0); + might_sleep(); + *obj = NULL; + + /* + * Before allocating a new slab wait for any reaping to complete and + * then return so the local magazine can be rechecked for new objects. + */ + if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { + rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING, + TASK_UNINTERRUPTIBLE); + return (rc ? rc : -EAGAIN); + } + + /* + * This is handled by dispatching a work request to the global work + * queue. This allows us to asynchronously allocate a new slab while + * retaining the ability to safely fall back to a smaller synchronous + * allocations to ensure forward progress is always maintained. 
+ */ + if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { + spl_kmem_alloc_t *ska; + + ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags)); + if (ska == NULL) { + clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_all(&skc->skc_waitq); + return (-ENOMEM); + } + + atomic_inc(&skc->skc_ref); + ska->ska_cache = skc; + ska->ska_flags = flags; + taskq_init_ent(&ska->ska_tqe); + taskq_dispatch_ent(spl_kmem_cache_taskq, + spl_cache_grow_work, ska, 0, &ska->ska_tqe); + } + + /* + * The goal here is to only detect the rare case where a virtual slab + * allocation has deadlocked. We must be careful to minimize the use + * of emergency objects which are more expensive to track. Therefore, + * we set a very long timeout for the asynchronous allocation and if + * the timeout is reached the cache is flagged as deadlocked. From + * this point only new emergency objects will be allocated until the + * asynchronous allocation completes and clears the deadlocked flag. + */ + if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) { + rc = spl_emergency_alloc(skc, flags, obj); + } else { + remaining = wait_event_timeout(skc->skc_waitq, + spl_cache_grow_wait(skc), HZ / 10); + + if (!remaining) { + spin_lock(&skc->skc_lock); + if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { + set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); + skc->skc_obj_deadlock++; + } + spin_unlock(&skc->skc_lock); + } + + rc = -ENOMEM; + } + + return (rc); +} + +/* + * Refill a per-cpu magazine with objects from the slabs for this cache. + * Ideally the magazine can be repopulated using existing objects which have + * been released, however if we are unable to locate enough free objects new + * slabs of objects will be created. On success NULL is returned, otherwise + * the address of a single emergency object is returned for use by the caller. + */ +static void * +spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) +{ + spl_kmem_slab_t *sks; + int count = 0, rc, refill; + void *obj = NULL; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skm->skm_magic == SKM_MAGIC); + + refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); + spin_lock(&skc->skc_lock); + + while (refill > 0) { + /* No slabs available we may need to grow the cache */ + if (list_empty(&skc->skc_partial_list)) { + spin_unlock(&skc->skc_lock); + + local_irq_enable(); + rc = spl_cache_grow(skc, flags, &obj); + local_irq_disable(); + + /* Emergency object for immediate use by caller */ + if (rc == 0 && obj != NULL) + return (obj); + + if (rc) + goto out; + + /* Rescheduled to different CPU skm is not local */ + if (skm != skc->skc_mag[smp_processor_id()]) + goto out; + + /* + * Potentially rescheduled to the same CPU but + * allocations may have occurred from this CPU while + * we were sleeping so recalculate max refill. + */ + refill = MIN(refill, skm->skm_size - skm->skm_avail); + + spin_lock(&skc->skc_lock); + continue; + } + + /* Grab the next available slab */ + sks = list_entry((&skc->skc_partial_list)->next, + spl_kmem_slab_t, sks_list); + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_ref < sks->sks_objs); + ASSERT(!list_empty(&sks->sks_free_list)); + + /* + * Consume as many objects as needed to refill the requested + * cache. We must also be careful not to overfill it. 
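+		 *
+		 * (Editor's note: the `++count` term in the while condition
+		 * below is always nonzero, so it never terminates the loop;
+		 * it merely tallies the objects taken this pass so the
+		 * ASSERTs can bound it by the magazine size.)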
+		 */
+		while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
+		    ++count) {
+			ASSERT(skm->skm_avail < skm->skm_size);
+			ASSERT(count < skm->skm_size);
+			skm->skm_objs[skm->skm_avail++] =
+			    spl_cache_obj(skc, sks);
+		}
+
+		/* Move slab to skc_complete_list when full */
+		if (sks->sks_ref == sks->sks_objs) {
+			list_del(&sks->sks_list);
+			list_add(&sks->sks_list, &skc->skc_complete_list);
+		}
+	}
+
+	spin_unlock(&skc->skc_lock);
+out:
+	return (NULL);
+}
+
+/*
+ * Release an object back to the slab from which it came.
+ */
+static void
+spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
+{
+	spl_kmem_slab_t *sks = NULL;
+	spl_kmem_obj_t *sko = NULL;
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(spin_is_locked(&skc->skc_lock));
+
+	sko = spl_sko_from_obj(skc, obj);
+	ASSERT(sko->sko_magic == SKO_MAGIC);
+	sks = sko->sko_slab;
+	ASSERT(sks->sks_magic == SKS_MAGIC);
+	ASSERT(sks->sks_cache == skc);
+	list_add(&sko->sko_list, &sks->sks_free_list);
+
+	sks->sks_age = jiffies;
+	sks->sks_ref--;
+	skc->skc_obj_alloc--;
+
+	/*
+	 * Move slab to skc_partial_list when no longer full. Slabs
+	 * are added to the head to keep the partial list in quasi-full
+	 * sorted order. Fuller at the head, emptier at the tail.
+	 */
+	if (sks->sks_ref == (sks->sks_objs - 1)) {
+		list_del(&sks->sks_list);
+		list_add(&sks->sks_list, &skc->skc_partial_list);
+	}
+
+	/*
+	 * Move empty slabs to the end of the partial list so
+	 * they can be easily found and freed during reclamation.
+	 */
+	if (sks->sks_ref == 0) {
+		list_del(&sks->sks_list);
+		list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+		skc->skc_slab_alloc--;
+	}
+}
+
+/*
+ * Allocate an object from the per-cpu magazine, or, if the magazine
+ * is empty, directly allocate from a slab and repopulate the magazine.
+ */
+void *
+spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
+{
+	spl_kmem_magazine_t *skm;
+	void *obj = NULL;
+
+	ASSERT0(flags & ~KM_PUBLIC_MASK);
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+	/*
+	 * Allocate directly from a Linux slab. All optimizations are left
+	 * to the underlying cache; we only need to guarantee that KM_SLEEP
+	 * callers will never fail.
+	 */
+	if (skc->skc_flags & KMC_SLAB) {
+		struct kmem_cache *slc = skc->skc_linux_cache;
+		do {
+			obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
+		} while ((obj == NULL) && !(flags & KM_NOSLEEP));
+
+		goto ret;
+	}
+
+	local_irq_disable();
+
+restart:
+	/*
+	 * Safe to update per-cpu structure without lock, but
+	 * in the restart case we must be careful to reacquire
+	 * the local magazine since this may have changed
+	 * when we need to grow the cache.
+	 */
+	skm = skc->skc_mag[smp_processor_id()];
+	ASSERT(skm->skm_magic == SKM_MAGIC);
+
+	if (likely(skm->skm_avail)) {
+		/* Object available in CPU cache, use it */
+		obj = skm->skm_objs[--skm->skm_avail];
+		skm->skm_age = jiffies;
+	} else {
+		obj = spl_cache_refill(skc, skm, flags);
+		if ((obj == NULL) && !(flags & KM_NOSLEEP))
+			goto restart;
+
+		local_irq_enable();
+		goto ret;
+	}
+
+	local_irq_enable();
+	ASSERT(obj);
+	ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+
+ret:
+	/* Pre-emptively migrate object to CPU L1 cache */
+	if (obj) {
+		if (obj && skc->skc_ctor)
+			skc->skc_ctor(obj, skc->skc_private, flags);
+		else
+			prefetchw(obj);
+	}
+
+	return (obj);
+}
+EXPORT_SYMBOL(spl_kmem_cache_alloc);
+
+/*
+ * Free an object back to the local per-cpu magazine; there is no
+ * guarantee that this is the same magazine the object was originally
+ * allocated from. We may need to flush entries from the magazine
We may need to flush entire magazines back to the
+ * slabs to make space.
+ */
+void
+spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
+{
+    spl_kmem_magazine_t *skm;
+    unsigned long flags;
+    int do_reclaim = 0;
+    int do_emergency = 0;
+
+    ASSERT(skc->skc_magic == SKC_MAGIC);
+    ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+    /*
+     * Run the destructor.
+     */
+    if (skc->skc_dtor)
+        skc->skc_dtor(obj, skc->skc_private);
+
+    /*
+     * Free the object from the underlying Linux slab.
+     */
+    if (skc->skc_flags & KMC_SLAB) {
+        kmem_cache_free(skc->skc_linux_cache, obj);
+        return;
+    }
+
+    /*
+     * While a cache has outstanding emergency objects all freed objects
+     * must be checked. However, since emergency objects will never use
+     * a virtual address these objects can be safely excluded as an
+     * optimization.
+     */
+    if (!is_vmalloc_addr(obj)) {
+        spin_lock(&skc->skc_lock);
+        do_emergency = (skc->skc_obj_emergency > 0);
+        spin_unlock(&skc->skc_lock);
+
+        if (do_emergency && (spl_emergency_free(skc, obj) == 0))
+            return;
+    }
+
+    local_irq_save(flags);
+
+    /*
+     * Safe to update the per-cpu structure without a lock, but because
+     * no remote memory allocation tracking is being performed it is
+     * entirely possible to allocate an object from one CPU cache and
+     * return it to another.
+     */
+    skm = skc->skc_mag[smp_processor_id()];
+    ASSERT(skm->skm_magic == SKM_MAGIC);
+
+    /*
+     * Per-CPU cache full; flush it to make space for this object.
+     * This may result in an empty slab which can be reclaimed once
+     * interrupts are re-enabled.
+     */
+    if (unlikely(skm->skm_avail >= skm->skm_size)) {
+        spl_cache_flush(skc, skm, skm->skm_refill);
+        do_reclaim = 1;
+    }
+
+    /* Available space in cache, use it */
+    skm->skm_objs[skm->skm_avail++] = obj;
+
+    local_irq_restore(flags);
+
+    if (do_reclaim)
+        spl_slab_reclaim(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_free);
+
+/*
+ * The generic shrinker function for all caches. Under Linux a shrinker
+ * may not be tightly coupled with a slab cache. In fact Linux always
+ * systematically tries calling all registered shrinker callbacks which
+ * report that they contain unused objects. Because of this we only
+ * register one shrinker function in the shim layer for all slab caches.
+ * We always attempt to shrink all caches when this generic shrinker
+ * is called.
+ *
+ * If sc->nr_to_scan is zero, the caller is requesting a query of the
+ * number of objects which can potentially be freed. If it is nonzero,
+ * the request is to free that many objects.
+ *
+ * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
+ * in struct shrinker and also require the shrinker to return the number
+ * of objects freed.
+ *
+ * Older kernels require the shrinker to return the number of freeable
+ * objects remaining after the freeing of nr_to_scan.
+ *
+ * Linux semantics differ from those under Solaris, which are to
+ * free all available objects, which may (and probably will) be more
+ * objects than the requested nr_to_scan.
+ */
+static spl_shrinker_t
+__spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
+    struct shrink_control *sc)
+{
+    spl_kmem_cache_t *skc;
+    int alloc = 0;
+
+    /*
+     * No shrinking in a transaction context; it can cause deadlocks.
+     */
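
The block comment above distinguishes a pure query (sc->nr_to_scan == 0) from an actual scan request, and notes that kernels >= 3.12 split these into count_objects and scan_objects callbacks. A userspace mock of that split, with illustrative names only (the patch's real glue is spl_shrinker_t and SPL_SHRINKER_CALLBACK_WRAPPER):

    /* Userspace mock of the post-3.12 split shrinker interface. */
    struct shrink_ctl {
        unsigned long nr_to_scan;   /* would be 0 for "count only" */
    };

    static unsigned long cached_objects = 1024; /* stand-in for skc_obj_alloc */

    /* count_objects(): report how many objects could potentially be freed. */
    static unsigned long mock_count(void)
    {
        return (cached_objects);
    }

    /* scan_objects(): free up to nr_to_scan objects, return the number freed. */
    static unsigned long mock_scan(struct shrink_ctl *sc)
    {
        unsigned long freed = sc->nr_to_scan < cached_objects ?
            sc->nr_to_scan : cached_objects;

        cached_objects -= freed;
        return (freed);
    }
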
+    if (sc->nr_to_scan && spl_fstrans_check())
+        return (SHRINK_STOP);
+
+    down_read(&spl_kmem_cache_sem);
+    list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+        if (sc->nr_to_scan) {
+#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
+            uint64_t oldalloc = skc->skc_obj_alloc;
+            spl_kmem_cache_reap_now(skc,
+                MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
+            if (oldalloc > skc->skc_obj_alloc)
+                alloc += oldalloc - skc->skc_obj_alloc;
+#else
+            spl_kmem_cache_reap_now(skc,
+                MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
+            alloc += skc->skc_obj_alloc;
+#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */
+        } else {
+            /* Request to query number of freeable objects */
+            alloc += skc->skc_obj_alloc;
+        }
+    }
+    up_read(&spl_kmem_cache_sem);
+
+    /*
+     * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
+     * This functionality only exists to work around a rare issue where
+     * shrink_slabs() is repeatedly invoked by many cores causing the
+     * system to thrash.
+     */
+    if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
+        return (SHRINK_STOP);
+
+    return (MAX(alloc, 0));
+}
+
+SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
+
+/*
+ * Call the registered reclaim function for a cache. Depending on how
+ * many and which objects are released it may simply repopulate the
+ * local magazine, which will then need to age out. Objects which cannot
+ * fit in the magazine will be released back to their slabs, which will
+ * also need to age out before being released. This is all just best
+ * effort and we do not want to thrash creating and destroying slabs.
+ */
+void
+spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
+{
+    ASSERT(skc->skc_magic == SKC_MAGIC);
+    ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+    atomic_inc(&skc->skc_ref);
+
+    /*
+     * Execute the registered reclaim callback if it exists.
+     */
+    if (skc->skc_flags & KMC_SLAB) {
+        if (skc->skc_reclaim)
+            skc->skc_reclaim(skc->skc_private);
+        goto out;
+    }
+
+    /*
+     * Prevent concurrent cache reaping when contended.
+     */
+    if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
+        goto out;
+
+    /*
+     * When a reclaim function is available it may be invoked repeatedly
+     * until at least a single slab can be freed. This ensures that we
+     * do free memory back to the system. This helps minimize the chance
+     * of an OOM event when the bulk of memory is used by the slab.
+     *
+     * When free slabs are already available the reclaim callback will be
+     * skipped. Additionally, if no forward progress is detected despite
+     * a reclaim function the cache will be skipped to avoid deadlock.
+     *
+     * Longer term this would be the correct place to add the code which
+     * repacks the slabs in order to minimize fragmentation.
+     */
+    if (skc->skc_reclaim) {
+        uint64_t objects = UINT64_MAX;
+        int do_reclaim;
+
+        do {
+            spin_lock(&skc->skc_lock);
+            do_reclaim =
+                (skc->skc_slab_total > 0) &&
+                ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) &&
+                (skc->skc_obj_alloc < objects);
+
+            objects = skc->skc_obj_alloc;
+            spin_unlock(&skc->skc_lock);
+
+            if (do_reclaim)
+                skc->skc_reclaim(skc->skc_private);
+
+        } while (do_reclaim);
+    }
+
+    /* Reclaim from the magazine and free all now empty slabs.
*/ + if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) { + spl_kmem_magazine_t *skm; + unsigned long irq_flags; + + local_irq_save(irq_flags); + skm = skc->skc_mag[smp_processor_id()]; + spl_cache_flush(skc, skm, skm->skm_avail); + local_irq_restore(irq_flags); + } + + spl_slab_reclaim(skc); + clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); +out: + atomic_dec(&skc->skc_ref); +} +EXPORT_SYMBOL(spl_kmem_cache_reap_now); + +/* + * Reap all free slabs from all registered caches. + */ +void +spl_kmem_reap(void) +{ + struct shrink_control sc; + + sc.nr_to_scan = KMC_REAP_CHUNK; + sc.gfp_mask = GFP_KERNEL; + + (void) __spl_kmem_cache_generic_shrinker(NULL, &sc); +} +EXPORT_SYMBOL(spl_kmem_reap); + +int +spl_kmem_cache_init(void) +{ + init_rwsem(&spl_kmem_cache_sem); + INIT_LIST_HEAD(&spl_kmem_cache_list); + spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", + spl_kmem_cache_kmem_threads, maxclsyspri, + spl_kmem_cache_kmem_threads * 8, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + spl_register_shrinker(&spl_kmem_cache_shrinker); + + return (0); +} + +void +spl_kmem_cache_fini(void) +{ + spl_unregister_shrinker(&spl_kmem_cache_shrinker); + taskq_destroy(spl_kmem_cache_taskq); +} diff -Naur spl-0.6.5.7/module/spl/spl-kobj.c spl-0.6.5.7.new/module/spl/spl-kobj.c --- spl-0.6.5.7/module/spl/spl-kobj.c 2016-05-13 04:46:56.000000000 +0200 +++ spl-0.6.5.7.new/module/spl/spl-kobj.c 2016-08-01 16:43:32.403791098 +0200 @@ -57,10 +57,15 @@ EXPORT_SYMBOL(kobj_close_file); int -kobj_read_file(struct _buf *file, char *buf, ssize_t size, offset_t off) +kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) { - return (vn_rdwr(UIO_READ, file->vp, buf, size, off, - UIO_SYSSPACE, 0, RLIM64_INFINITY, 0, NULL)); + ssize_t resid; + + if (vn_rdwr(UIO_READ, file->vp, buf, size, (offset_t)off, + UIO_SYSSPACE, 0, 0, 0, &resid) != 0) + return (-1); + + return (size - resid); } /* kobj_read_file() */ EXPORT_SYMBOL(kobj_read_file); diff -Naur spl-0.6.5.7/module/spl/spl-proc.c spl-0.6.5.7.new/module/spl/spl-proc.c --- spl-0.6.5.7/module/spl/spl-proc.c 2016-05-13 04:46:56.000000000 +0200 +++ spl-0.6.5.7.new/module/spl/spl-proc.c 2016-08-01 16:43:26.107773512 +0200 @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,8 @@ static struct proc_dir_entry *proc_spl = NULL; static struct proc_dir_entry *proc_spl_kmem = NULL; static struct proc_dir_entry *proc_spl_kmem_slab = NULL; +static struct proc_dir_entry *proc_spl_taskq_all = NULL; +static struct proc_dir_entry *proc_spl_taskq = NULL; struct proc_dir_entry *proc_spl_kstat = NULL; static int @@ -216,6 +219,176 @@ } static void +taskq_seq_show_headers(struct seq_file *f) +{ + seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n", + "taskq", "act", "nthr", "spwn", "maxt", "pri", + "mina", "maxa", "cura", "flags"); +} + +/* indices into the lheads array below */ +#define LHEAD_PEND 0 +#define LHEAD_PRIO 1 +#define LHEAD_DELAY 2 +#define LHEAD_WAIT 3 +#define LHEAD_ACTIVE 4 +#define LHEAD_SIZE 5 + +static int +taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag) +{ + taskq_t *tq = p; + taskq_thread_t *tqt; + wait_queue_t *wq; + struct task_struct *tsk; + taskq_ent_t *tqe; + char name[100]; + struct list_head *lheads[LHEAD_SIZE], *lh; + static char *list_names[LHEAD_SIZE] = + {"pend", "prio", "delay", "wait", "active" }; + int i, j, have_lheads = 0; + unsigned long wflags, flags; + + spin_lock_irqsave_nested(&tq->tq_lock, 
flags, tq->tq_lock_class); + spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags); + + /* get the various lists and check whether they're empty */ + lheads[LHEAD_PEND] = &tq->tq_pend_list; + lheads[LHEAD_PRIO] = &tq->tq_prio_list; + lheads[LHEAD_DELAY] = &tq->tq_delay_list; + lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list; + lheads[LHEAD_ACTIVE] = &tq->tq_active_list; + + for (i = 0; i < LHEAD_SIZE; ++i) { + if (list_empty(lheads[i])) + lheads[i] = NULL; + else + ++have_lheads; + } + + /* early return in non-"all" mode if lists are all empty */ + if (!allflag && !have_lheads) { + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + spin_unlock_irqrestore(&tq->tq_lock, flags); + return (0); + } + + /* unlock the waitq quickly */ + if (!lheads[LHEAD_WAIT]) + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + + /* show the base taskq contents */ + snprintf(name, sizeof(name), "%s/%d", tq->tq_name, tq->tq_instance); + seq_printf(f, "%-25s ", name); + seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n", + tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn, + tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc, + tq->tq_nalloc, tq->tq_flags); + + /* show the active list */ + if (lheads[LHEAD_ACTIVE]) { + j = 0; + list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) { + if (j == 0) + seq_printf(f, "\t%s:", list_names[LHEAD_ACTIVE]); + else if (j == 2) { + seq_printf(f, "\n\t "); + j = 0; + } + seq_printf(f, " [%d]%pf(%ps)", + tqt->tqt_thread->pid, + tqt->tqt_task->tqent_func, + tqt->tqt_task->tqent_arg); + ++j; + } + seq_printf(f, "\n"); + } + + for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i) + if (lheads[i]) { + j = 0; + list_for_each(lh, lheads[i]) { + /* show the wait waitq list */ + if (i == LHEAD_WAIT) { + wq = list_entry(lh, wait_queue_t, task_list); + if (j == 0) + seq_printf(f, "\t%s:", + list_names[i]); + else if (j == 12) { + seq_printf(f, "\n\t "); + j = 0; + } + tsk = wq->private; + seq_printf(f, " %d", tsk->pid); + /* pend, prio and delay lists */ + } else { + tqe = list_entry(lh, taskq_ent_t, + tqent_list); + if (j == 0) + seq_printf(f, "\t%s:", + list_names[i]); + else if (j == 2) { + seq_printf(f, "\n\t "); + j = 0; + } + seq_printf(f, " %pf(%ps)", + tqe->tqent_func, + tqe->tqent_arg); + } + ++j; + } + seq_printf(f, "\n"); + } + if (lheads[LHEAD_WAIT]) + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + return (0); +} + +static int +taskq_all_seq_show(struct seq_file *f, void *p) +{ + return (taskq_seq_show_impl(f, p, B_TRUE)); +} + +static int +taskq_seq_show(struct seq_file *f, void *p) +{ + return (taskq_seq_show_impl(f, p, B_FALSE)); +} + +static void * +taskq_seq_start(struct seq_file *f, loff_t *pos) +{ + struct list_head *p; + loff_t n = *pos; + + down_read(&tq_list_sem); + if (!n) + taskq_seq_show_headers(f); + + p = tq_list.next; + while (n--) { + p = p->next; + if (p == &tq_list) + return (NULL); + } + + return (list_entry(p, taskq_t, tq_taskqs)); +} + +static void * +taskq_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + taskq_t *tq = p; + + ++*pos; + return ((tq->tq_taskqs.next == &tq_list) ? 
+ NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs)); +} + +static void slab_seq_show_headers(struct seq_file *f) { seq_printf(f, @@ -325,6 +498,52 @@ .release = seq_release, }; +static void +taskq_seq_stop(struct seq_file *f, void *v) +{ + up_read(&tq_list_sem); +} + +static struct seq_operations taskq_all_seq_ops = { + .show = taskq_all_seq_show, + .start = taskq_seq_start, + .next = taskq_seq_next, + .stop = taskq_seq_stop, +}; + +static struct seq_operations taskq_seq_ops = { + .show = taskq_seq_show, + .start = taskq_seq_start, + .next = taskq_seq_next, + .stop = taskq_seq_stop, +}; + +static int +proc_taskq_all_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &taskq_all_seq_ops); +} + +static int +proc_taskq_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &taskq_seq_ops); +} + +static struct file_operations proc_taskq_all_operations = { + .open = proc_taskq_all_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct file_operations proc_taskq_operations = { + .open = proc_taskq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static struct ctl_table spl_kmem_table[] = { #ifdef DEBUG_KMEM { @@ -476,6 +695,20 @@ goto out; } + proc_spl_taskq_all = proc_create_data("taskq-all", 0444, + proc_spl, &proc_taskq_all_operations, NULL); + if (proc_spl_taskq_all == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_taskq = proc_create_data("taskq", 0444, + proc_spl, &proc_taskq_operations, NULL); + if (proc_spl_taskq == NULL) { + rc = -EUNATCH; + goto out; + } + proc_spl_kmem = proc_mkdir("kmem", proc_spl); if (proc_spl_kmem == NULL) { rc = -EUNATCH; @@ -499,6 +732,8 @@ remove_proc_entry("kstat", proc_spl); remove_proc_entry("slab", proc_spl_kmem); remove_proc_entry("kmem", proc_spl); + remove_proc_entry("taskq-all", proc_spl); + remove_proc_entry("taskq", proc_spl); remove_proc_entry("spl", NULL); unregister_sysctl_table(spl_header); } @@ -512,6 +747,8 @@ remove_proc_entry("kstat", proc_spl); remove_proc_entry("slab", proc_spl_kmem); remove_proc_entry("kmem", proc_spl); + remove_proc_entry("taskq-all", proc_spl); + remove_proc_entry("taskq", proc_spl); remove_proc_entry("spl", NULL); ASSERT(spl_header != NULL); diff -Naur spl-0.6.5.7/module/spl/spl-rwlock.c spl-0.6.5.7.new/module/spl/spl-rwlock.c --- spl-0.6.5.7/module/spl/spl-rwlock.c 2016-05-13 04:46:56.000000000 +0200 +++ spl-0.6.5.7.new/module/spl/spl-rwlock.c 2016-08-01 16:43:34.281796344 +0200 @@ -32,65 +32,55 @@ #define DEBUG_SUBSYSTEM S_RWLOCK -#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK - -/* - * From lib/rwsem-spinlock.c but modified such that the caller is - * responsible for acquiring and dropping the sem->wait_lock. 
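
The spinlock-based helpers being removed in this hunk are superseded by the cmpxchg-based __rwsem_tryupgrade() variants added just below: the upgrade succeeds only when the semaphore count shows exactly one reader, which is atomically swapped for the single-writer value. A userspace sketch of that compare-and-swap idea using C11 atomics (the constants are illustrative; the real values come from SPL_RWSEM_SINGLE_READER_VALUE/SPL_RWSEM_SINGLE_WRITER_VALUE):

    #include <stdatomic.h>

    #define ONE_READER    1L
    #define ONE_WRITER  (-1L)

    /*
     * Try to upgrade a read lock to a write lock without blocking:
     * succeed only if we are the sole reader, atomically replacing
     * the single-reader count with the single-writer value.
     */
    static int tryupgrade_sketch(atomic_long *count)
    {
        long expected = ONE_READER;

        return (atomic_compare_exchange_strong(count, &expected, ONE_WRITER));
    }

If the exchange fails, another reader or a queued writer got there first, and the caller must fall back to a full unlock/relock.
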
- */ -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - unsigned int flags; -#define RWSEM_WAITING_FOR_READ 0x00000001 -#define RWSEM_WAITING_FOR_WRITE 0x00000002 -}; - -/* wake a single writer */ -static struct rw_semaphore * -__rwsem_wake_one_writer_locked(struct rw_semaphore *sem) +#if defined(CONFIG_RWSEM_GENERIC_SPINLOCK) +static int +__rwsem_tryupgrade(struct rw_semaphore *rwsem) { - struct rwsem_waiter *waiter; - struct task_struct *tsk; - - sem->activity = -1; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - list_del(&waiter->list); - - tsk = waiter->task; - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - return sem; + int ret = 0; + unsigned long flags; + spl_rwsem_lock_irqsave(&rwsem->wait_lock, flags); + if (RWSEM_COUNT(rwsem) == SPL_RWSEM_SINGLE_READER_VALUE && + list_empty(&rwsem->wait_list)) { + ret = 1; + RWSEM_COUNT(rwsem) = SPL_RWSEM_SINGLE_WRITER_VALUE; + } + spl_rwsem_unlock_irqrestore(&rwsem->wait_lock, flags); + return (ret); } - -/* release a read lock on the semaphore */ -void -__up_read_locked(struct rw_semaphore *sem) +#elif defined(HAVE_RWSEM_ATOMIC_LONG_COUNT) +static int +__rwsem_tryupgrade(struct rw_semaphore *rwsem) { - if (--sem->activity == 0 && !list_empty(&sem->wait_list)) - (void)__rwsem_wake_one_writer_locked(sem); + long val; + val = atomic_long_cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE, + SPL_RWSEM_SINGLE_WRITER_VALUE); + return (val == SPL_RWSEM_SINGLE_READER_VALUE); } -EXPORT_SYMBOL(__up_read_locked); - -/* trylock for writing -- returns 1 if successful, 0 if contention */ -int -__down_write_trylock_locked(struct rw_semaphore *sem) +#else +static int +__rwsem_tryupgrade(struct rw_semaphore *rwsem) { - int ret = 0; - - if (sem->activity == 0 && list_empty(&sem->wait_list)) { - sem->activity = -1; - ret = 1; - } - - return ret; + typeof (rwsem->count) val; + val = cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE, + SPL_RWSEM_SINGLE_WRITER_VALUE); + return (val == SPL_RWSEM_SINGLE_READER_VALUE); } -EXPORT_SYMBOL(__down_write_trylock_locked); +#endif +int +rwsem_tryupgrade(struct rw_semaphore *rwsem) +{ + if (__rwsem_tryupgrade(rwsem)) { + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + rwsem->owner = current; #endif + return (1); + } + return (0); +} +EXPORT_SYMBOL(rwsem_tryupgrade); int spl_rw_init(void) { return 0; } void spl_rw_fini(void) { } diff -Naur spl-0.6.5.7/module/spl/spl-taskq.c spl-0.6.5.7.new/module/spl/spl-taskq.c --- spl-0.6.5.7/module/spl/spl-taskq.c 2016-05-13 04:46:57.000000000 +0200 +++ spl-0.6.5.7.new/module/spl/spl-taskq.c 2016-08-01 16:43:34.279796338 +0200 @@ -1,4 +1,4 @@ -/*****************************************************************************\ +/* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -20,12 +20,13 @@ * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . - ***************************************************************************** + * * Solaris Porting Layer (SPL) Task Queue Implementation. 
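
A usage sketch for the rwsem_tryupgrade() exported in the spl-rwlock.c hunk above (kernel-only pseudo-usage, illustrative and not taken from this patch): the expected caller pattern is try the cheap upgrade first, and only drop and retake the semaphore when it fails.

    /* Hypothetical caller; revalidation after the relock is essential. */
    static void update_if_needed(struct rw_semaphore *sem)
    {
        down_read(sem);
        /* ... inspect shared state, decide an update is required ... */

        if (!rwsem_tryupgrade(sem)) {
            /* Upgrade raced with another user: drop and retake for write. */
            up_read(sem);
            down_write(sem);
            /* State may have changed in the window; revalidate here. */
        }

        /* ... perform the update with the write lock held ... */
        up_write(sem);
    }
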
-\*****************************************************************************/ + */ #include #include +#include int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); @@ -39,12 +40,12 @@ int spl_taskq_thread_priority = 1; module_param(spl_taskq_thread_priority, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_priority, - "Allow non-default priority for taskq threads"); + "Allow non-default priority for taskq threads"); int spl_taskq_thread_sequential = 4; module_param(spl_taskq_thread_sequential, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_sequential, - "Create new taskq threads after N sequential tasks"); + "Create new taskq threads after N sequential tasks"); /* Global system-wide dynamic task queue available for all consumers */ taskq_t *system_taskq; @@ -54,16 +55,38 @@ static taskq_t *dynamic_taskq; static taskq_thread_t *taskq_thread_create(taskq_t *); +/* List of all taskqs */ +LIST_HEAD(tq_list); +DECLARE_RWSEM(tq_list_sem); +static uint_t taskq_tsd; + static int task_km_flags(uint_t flags) { if (flags & TQ_NOSLEEP) - return KM_NOSLEEP; + return (KM_NOSLEEP); if (flags & TQ_PUSHPAGE) - return KM_PUSHPAGE; + return (KM_PUSHPAGE); + + return (KM_SLEEP); +} + +/* + * taskq_find_by_name - Find the largest instance number of a named taskq. + */ +static int +taskq_find_by_name(const char *name) +{ + struct list_head *tql; + taskq_t *tq; - return KM_SLEEP; + list_for_each_prev(tql, &tq_list) { + tq = list_entry(tql, taskq_t, tq_taskqs); + if (strcmp(name, tq->tq_name) == 0) + return tq->tq_instance; + } + return (-1); } /* @@ -71,7 +94,7 @@ * is not attached to the free, work, or pending taskq lists. */ static taskq_ent_t * -task_alloc(taskq_t *tq, uint_t flags) +task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags) { taskq_ent_t *t; int count = 0; @@ -111,18 +134,19 @@ * end up delaying the task allocation by one second, thereby * throttling the task dispatch rate. */ - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, *irqflags); schedule_timeout(HZ / 100); - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, + tq->tq_lock_class); if (count < 100) { count++; goto retry; } } - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); - t = kmem_alloc(sizeof(taskq_ent_t), task_km_flags(flags)); - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, *irqflags); + t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags)); + spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class); if (t) { taskq_init_ent(t); @@ -145,7 +169,7 @@ ASSERT(list_empty(&t->tqent_list)); ASSERT(!timer_pending(&t->tqent_timer)); - kmem_free(t, sizeof(taskq_ent_t)); + kmem_free(t, sizeof (taskq_ent_t)); tq->tq_nalloc--; } @@ -187,15 +211,17 @@ taskq_ent_t *w, *t = (taskq_ent_t *)data; taskq_t *tq = t->tqent_taskq; struct list_head *l; + unsigned long flags; - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); if (t->tqent_flags & TQENT_FLAG_CANCEL) { ASSERT(list_empty(&t->tqent_list)); - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, flags); return; } + t->tqent_birth = jiffies; /* * The priority list must be maintained in strict task id order * from lowest to highest for lowest_id to be easily calculable. 
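
The ordering invariant described above is what keeps taskq_lowest_id() cheap: every list stays sorted by task id, so only list heads ever need inspection. A standalone sketch of the tail-first sorted insert used for the priority list (types are illustrative):

    struct ent {
        unsigned long id;
        struct ent *prev, *next;
    };

    /*
     * Insert 't' into a circular doubly linked list (with a sentinel
     * head), keeping ids in ascending order. Scanning from the tail is
     * cheap because newly expired tasks usually carry recent, large ids.
     */
    static void insert_in_order(struct ent *head, struct ent *t)
    {
        struct ent *w = head->prev;   /* start at the tail */

        while (w != head && w->id > t->id)
            w = w->prev;

        t->prev = w;
        t->next = w->next;
        w->next->prev = t;
        w->next = t;
    }
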
@@ -211,7 +237,7 @@ if (l == &tq->tq_prio_list) list_add(&t->tqent_list, &tq->tq_prio_list); - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, flags); wake_up(&tq->tq_work_waitq); } @@ -378,10 +404,11 @@ { int active = 0; int rc; + unsigned long flags; - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); rc = (taskq_find(tq, id, &active) == NULL); - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, flags); return (rc); } @@ -401,10 +428,11 @@ taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id) { int rc; + unsigned long flags; - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); rc = (id < tq->tq_lowest_id); - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, flags); return (rc); } @@ -419,8 +447,8 @@ void taskq_wait_outstanding(taskq_t *tq, taskqid_t id) { - wait_event(tq->tq_wait_waitq, - taskq_wait_outstanding_check(tq, id ? id : tq->tq_next_id - 1)); + id = id ? id : tq->tq_next_id - 1; + wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id)); } EXPORT_SYMBOL(taskq_wait_outstanding); @@ -428,10 +456,11 @@ taskq_wait_check(taskq_t *tq) { int rc; + unsigned long flags; - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); rc = (tq->tq_lowest_id == tq->tq_next_id); - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, flags); return (rc); } @@ -448,37 +477,10 @@ } EXPORT_SYMBOL(taskq_wait); -static int -taskq_member_impl(taskq_t *tq, void *t) -{ - struct list_head *l; - taskq_thread_t *tqt; - int found = 0; - - ASSERT(tq); - ASSERT(t); - ASSERT(spin_is_locked(&tq->tq_lock)); - - list_for_each(l, &tq->tq_thread_list) { - tqt = list_entry(l, taskq_thread_t, tqt_thread_list); - if (tqt->tqt_thread == (struct task_struct *)t) { - found = 1; - break; - } - } - return (found); -} - int -taskq_member(taskq_t *tq, void *t) +taskq_member(taskq_t *tq, kthread_t *t) { - int found; - - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); - found = taskq_member_impl(tq, t); - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); - - return (found); + return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t)); } EXPORT_SYMBOL(taskq_member); @@ -494,10 +496,11 @@ taskq_ent_t *t; int active = 0; int rc = ENOENT; + unsigned long flags; ASSERT(tq); - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); t = taskq_find(tq, id, &active); if (t && !active) { list_del_init(&t->tqent_list); @@ -517,9 +520,10 @@ * drop the lock before synchronously cancelling the timer. 
*/ if (timer_pending(&t->tqent_timer)) { - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, flags); del_timer_sync(&t->tqent_timer); - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); } if (!(t->tqent_flags & TQENT_FLAG_PREALLOC)) @@ -527,7 +531,7 @@ rc = 0; } - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, flags); if (active) { taskq_wait_id(tq, id); @@ -545,11 +549,12 @@ { taskq_ent_t *t; taskqid_t rc = 0; + unsigned long irqflags; ASSERT(tq); ASSERT(func); - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); /* Taskq being destroyed and all tasks drained */ if (!(tq->tq_flags & TASKQ_ACTIVE)) @@ -557,16 +562,22 @@ /* Do not queue the task unless there is idle thread for it */ ASSERT(tq->tq_nactive <= tq->tq_nthreads); - if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) - goto out; + if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { + /* Dynamic taskq may be able to spawn another thread */ + if (!(tq->tq_flags & TASKQ_DYNAMIC) || taskq_thread_spawn(tq) == 0) + goto out; + } - if ((t = task_alloc(tq, flags)) == NULL) + if ((t = task_alloc(tq, flags, &irqflags)) == NULL) goto out; spin_lock(&t->tqent_lock); + /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */ + if (flags & TQ_NOQUEUE) + list_add(&t->tqent_list, &tq->tq_prio_list); /* Queue to the priority list instead of the pending list */ - if (flags & TQ_FRONT) + else if (flags & TQ_FRONT) list_add_tail(&t->tqent_list, &tq->tq_prio_list); else list_add_tail(&t->tqent_list, &tq->tq_pend_list); @@ -579,6 +590,7 @@ t->tqent_timer.data = 0; t->tqent_timer.function = NULL; t->tqent_timer.expires = 0; + t->tqent_birth = jiffies; ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); @@ -587,10 +599,10 @@ wake_up(&tq->tq_work_waitq); out: /* Spawn additional taskq threads if required. */ - if (tq->tq_nactive == tq->tq_nthreads) + if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, irqflags); return (rc); } EXPORT_SYMBOL(taskq_dispatch); @@ -601,17 +613,18 @@ { taskqid_t rc = 0; taskq_ent_t *t; + unsigned long irqflags; ASSERT(tq); ASSERT(func); - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); /* Taskq being destroyed and all tasks drained */ if (!(tq->tq_flags & TASKQ_ACTIVE)) goto out; - if ((t = task_alloc(tq, flags)) == NULL) + if ((t = task_alloc(tq, flags, &irqflags)) == NULL) goto out; spin_lock(&t->tqent_lock); @@ -636,19 +649,21 @@ /* Spawn additional taskq threads if required. 
*/ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, irqflags); return (rc); } EXPORT_SYMBOL(taskq_dispatch_delay); void taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, - taskq_ent_t *t) + taskq_ent_t *t) { + unsigned long irqflags; ASSERT(tq); ASSERT(func); - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, + tq->tq_lock_class); /* Taskq being destroyed and all tasks drained */ if (!(tq->tq_flags & TASKQ_ACTIVE)) { @@ -656,6 +671,13 @@ goto out; } + if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { + /* Dynamic taskq may be able to spawn another thread */ + if (!(tq->tq_flags & TASKQ_DYNAMIC) || taskq_thread_spawn(tq) == 0) + goto out2; + flags |= TQ_FRONT; + } + spin_lock(&t->tqent_lock); /* @@ -675,6 +697,7 @@ t->tqent_func = func; t->tqent_arg = arg; t->tqent_taskq = tq; + t->tqent_birth = jiffies; spin_unlock(&t->tqent_lock); @@ -683,14 +706,15 @@ /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); +out2: + spin_unlock_irqrestore(&tq->tq_lock, irqflags); } EXPORT_SYMBOL(taskq_dispatch_ent); int taskq_empty_ent(taskq_ent_t *t) { - return list_empty(&t->tqent_list); + return (list_empty(&t->tqent_list)); } EXPORT_SYMBOL(taskq_empty_ent); @@ -737,16 +761,18 @@ taskq_thread_spawn_task(void *arg) { taskq_t *tq = (taskq_t *)arg; + unsigned long flags; - (void) taskq_thread_create(tq); - - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); - tq->tq_nspawn--; - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + if (taskq_thread_create(tq) == NULL) { + /* restore spawning count if failed */ + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + tq->tq_nspawn--; + spin_unlock_irqrestore(&tq->tq_lock, flags); + } } /* - * Spawn addition threads for dynamic taskqs (TASKQ_DYNMAIC) the current + * Spawn addition threads for dynamic taskqs (TASKQ_DYNAMIC) the current * number of threads is insufficient to handle the pending tasks. These * new threads must be created by the dedicated dynamic_taskq to avoid * deadlocks between thread creation and memory reclaim. The system_taskq @@ -796,7 +822,7 @@ (tq->tq_nactive == 0) && /* No threads are handling tasks */ (tq->tq_nthreads > 1) && /* More than 1 thread is running */ (!taskq_next_ent(tq)) && /* There are no pending tasks */ - (spl_taskq_thread_dynamic));/* Dynamic taskqs are allowed */ + (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */ } static int @@ -808,8 +834,10 @@ taskq_t *tq; taskq_ent_t *t; int seq_tasks = 0; + unsigned long flags; ASSERT(tqt); + ASSERT(tqt->tqt_tq); tq = tqt->tqt_tq; current->flags |= PF_NOFREEZE; @@ -819,7 +847,16 @@ sigprocmask(SIG_BLOCK, &blocked, NULL); flush_signals(current); - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + tsd_set(taskq_tsd, tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + /* + * If we are dynamically spawned, decrease spawning count. Note that + * we could be created during taskq_create, in which case we shouldn't + * do the decrement. But it's fine because taskq_create will reset + * tq_nspawn later. + */ + if (tq->tq_flags & TASKQ_DYNAMIC) + tq->tq_nspawn--; /* Immediately exit if more threads than allowed were created. 
*/ if (tq->tq_nthreads >= tq->tq_maxthreads) @@ -841,12 +878,13 @@ } add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, flags); schedule(); seq_tasks = 0; - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); remove_wait_queue(&tq->tq_work_waitq, &wait); } else { __set_current_state(TASK_RUNNING); @@ -855,27 +893,32 @@ if ((t = taskq_next_ent(tq)) != NULL) { list_del_init(&t->tqent_list); - /* In order to support recursively dispatching a + /* + * In order to support recursively dispatching a * preallocated taskq_ent_t, tqent_id must be - * stored prior to executing tqent_func. */ + * stored prior to executing tqent_func. + */ tqt->tqt_id = t->tqent_id; tqt->tqt_task = t; - /* We must store a copy of the flags prior to + /* + * We must store a copy of the flags prior to * servicing the task (servicing a prealloc'd task * returns the ownership of the tqent back to * the caller of taskq_dispatch). Thus, - * tqent_flags _may_ change within the call. */ + * tqent_flags _may_ change within the call. + */ tqt->tqt_flags = t->tqent_flags; taskq_insert_in_order(tq, tqt); tq->tq_nactive++; - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, flags); /* Perform the requested task */ t->tqent_func(t->tqent_arg); - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); tq->tq_nactive--; list_del_init(&tqt->tqt_active_list); tqt->tqt_task = NULL; @@ -884,8 +927,10 @@ if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC)) task_done(tq, t); - /* When the current lowest outstanding taskqid is - * done calculate the new lowest outstanding id */ + /* + * When the current lowest outstanding taskqid is + * done calculate the new lowest outstanding id + */ if (tq->tq_lowest_id == tqt->tqt_id) { tq->tq_lowest_id = taskq_lowest_id(tq); ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id); @@ -913,7 +958,9 @@ list_del_init(&tqt->tqt_thread_list); error: kmem_free(tqt, sizeof (taskq_thread_t)); - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + tsd_set(taskq_tsd, NULL); return (0); } @@ -957,6 +1004,7 @@ taskq_t *tq; taskq_thread_t *tqt; int count = 0, rc = 0, i; + unsigned long irqflags; ASSERT(name != NULL); ASSERT(minalloc >= 0); @@ -979,32 +1027,36 @@ spin_lock_init(&tq->tq_lock); INIT_LIST_HEAD(&tq->tq_thread_list); INIT_LIST_HEAD(&tq->tq_active_list); - tq->tq_name = strdup(name); - tq->tq_nactive = 0; - tq->tq_nthreads = 0; - tq->tq_nspawn = 0; + tq->tq_name = strdup(name); + tq->tq_nactive = 0; + tq->tq_nthreads = 0; + tq->tq_nspawn = 0; tq->tq_maxthreads = nthreads; - tq->tq_pri = pri; - tq->tq_minalloc = minalloc; - tq->tq_maxalloc = maxalloc; - tq->tq_nalloc = 0; - tq->tq_flags = (flags | TASKQ_ACTIVE); - tq->tq_next_id = 1; - tq->tq_lowest_id = 1; + tq->tq_pri = pri; + tq->tq_minalloc = minalloc; + tq->tq_maxalloc = maxalloc; + tq->tq_nalloc = 0; + tq->tq_flags = (flags | TASKQ_ACTIVE); + tq->tq_next_id = 1; + tq->tq_lowest_id = 1; INIT_LIST_HEAD(&tq->tq_free_list); INIT_LIST_HEAD(&tq->tq_pend_list); INIT_LIST_HEAD(&tq->tq_prio_list); INIT_LIST_HEAD(&tq->tq_delay_list); init_waitqueue_head(&tq->tq_work_waitq); init_waitqueue_head(&tq->tq_wait_waitq); + tq->tq_lock_class = TQ_LOCK_GENERAL; + INIT_LIST_HEAD(&tq->tq_taskqs); if (flags & TASKQ_PREPOPULATE) { - spin_lock_irqsave(&tq->tq_lock, 
tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, + tq->tq_lock_class); for (i = 0; i < minalloc; i++) - task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW)); + task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW, + &irqflags)); - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, irqflags); } if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) @@ -1020,10 +1072,20 @@ /* Wait for all threads to be started before potential destroy */ wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count); + /* + * taskq_thread might have touched nspawn, but we don't want them to + * because they're not dynamically spawned. So we reset it to 0 + */ + tq->tq_nspawn = 0; if (rc) { taskq_destroy(tq); tq = NULL; + } else { + down_write(&tq_list_sem); + tq->tq_instance = taskq_find_by_name(name) + 1; + list_add_tail(&tq->tq_taskqs, &tq_list); + up_write(&tq_list_sem); } return (tq); @@ -1036,11 +1098,12 @@ struct task_struct *thread; taskq_thread_t *tqt; taskq_ent_t *t; + unsigned long flags; ASSERT(tq); - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); tq->tq_flags &= ~TASKQ_ACTIVE; - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, flags); /* * When TASKQ_ACTIVE is clear new tasks may not be added nor may @@ -1051,7 +1114,18 @@ taskq_wait(tq); - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + /* remove taskq from global list used by the kstats */ + down_write(&tq_list_sem); + list_del(&tq->tq_taskqs); + up_write(&tq_list_sem); + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + /* wait for spawning threads to insert themselves to the list */ + while (tq->tq_nspawn) { + spin_unlock_irqrestore(&tq->tq_lock, flags); + schedule_timeout_interruptible(1); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + } /* * Signal each thread to exit and block until it does. Each thread @@ -1063,11 +1137,12 @@ tqt = list_entry(tq->tq_thread_list.next, taskq_thread_t, tqt_thread_list); thread = tqt->tqt_thread; - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, flags); kthread_stop(thread); - spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); } while (!list_empty(&tq->tq_free_list)) { @@ -1089,16 +1164,75 @@ ASSERT(list_empty(&tq->tq_prio_list)); ASSERT(list_empty(&tq->tq_delay_list)); - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + spin_unlock_irqrestore(&tq->tq_lock, flags); strfree(tq->tq_name); kmem_free(tq, sizeof (taskq_t)); } EXPORT_SYMBOL(taskq_destroy); + +static unsigned int spl_taskq_kick = 0; + +/* + * 2.6.36 API Change + * module_param_cb is introduced to take kernel_param_ops and + * module_param_call is marked as obsolete. Also set and get operations + * were changed to take a 'const struct kernel_param *'. 
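
The parameter handler added below declares a taskq stuck when its oldest pending task is more than five seconds old, using time_after() so the comparison survives jiffies wraparound. The trick is a signed subtraction; a self-contained sketch:

    #include <stdio.h>

    /* Wraparound-safe "a is after b", like the kernel's time_after(). */
    static int time_after_sketch(unsigned long a, unsigned long b)
    {
        return ((long)(b - a) < 0);
    }

    int main(void)
    {
        unsigned long birth = (unsigned long)-3;  /* just before wraparound */
        unsigned long now = 7;                    /* 10 ticks later */

        printf("%d\n", time_after_sketch(now, birth));  /* prints 1 */
        return (0);
    }
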
+ */
+static int
+#ifdef module_param_cb
+param_set_taskq_kick(const char *val, const struct kernel_param *kp)
+#else
+param_set_taskq_kick(const char *val, struct kernel_param *kp)
+#endif
+{
+    int ret;
+    taskq_t *tq;
+    taskq_ent_t *t;
+    unsigned long flags;
+
+    ret = param_set_uint(val, kp);
+    if (ret < 0 || !spl_taskq_kick)
+        return (ret);
+    /* reset value */
+    spl_taskq_kick = 0;
+
+    down_read(&tq_list_sem);
+    list_for_each_entry(tq, &tq_list, tq_taskqs) {
+        spin_lock_irqsave_nested(&tq->tq_lock, flags,
+            tq->tq_lock_class);
+        /* Check if the first pending task is older than 5 seconds */
+        t = taskq_next_ent(tq);
+        if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) {
+            (void) taskq_thread_spawn(tq);
+            printk(KERN_INFO "spl: Kicked taskq %s/%d\n",
+                tq->tq_name, tq->tq_instance);
+        }
+        spin_unlock_irqrestore(&tq->tq_lock, flags);
+    }
+    up_read(&tq_list_sem);
+    return (ret);
+}
+
+#ifdef module_param_cb
+static const struct kernel_param_ops param_ops_taskq_kick = {
+    .set = param_set_taskq_kick,
+    .get = param_get_uint,
+};
+module_param_cb(spl_taskq_kick, &param_ops_taskq_kick, &spl_taskq_kick, 0644);
+#else
+module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint,
+    &spl_taskq_kick, 0644);
+#endif
+MODULE_PARM_DESC(spl_taskq_kick,
+    "Write nonzero to kick stuck taskqs to spawn more threads");
+
 int
 spl_taskq_init(void)
 {
+    tsd_create(&taskq_tsd, NULL);
+
     system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
         maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
     if (system_taskq == NULL)
@@ -1111,6 +1245,13 @@
         return (1);
     }
 
+    /*
+     * This is used to annotate tq_lock, so
+     * taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch
+     * does not trigger a lockdep warning re: possible recursive locking.
+     */
+    dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;
+
     return (0);
 }
 
@@ -1122,4 +1263,6 @@
 
     taskq_destroy(system_taskq);
     system_taskq = NULL;
+
+    tsd_destroy(&taskq_tsd);
 }
diff -Naur spl-0.6.5.7/module/spl/spl-taskq.c.orig spl-0.6.5.7.new/module/spl/spl-taskq.c.orig
--- spl-0.6.5.7/module/spl/spl-taskq.c.orig	1970-01-01 01:00:00.000000000 +0100
+++ spl-0.6.5.7.new/module/spl/spl-taskq.c.orig	2016-05-13 04:46:57.000000000 +0200
@@ -0,0 +1,1125 @@
+/*****************************************************************************\
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf .
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see .
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see .
+ *****************************************************************************
+ * Solaris Porting Layer (SPL) Task Queue Implementation.
+\*****************************************************************************/ + +#include +#include + +int spl_taskq_thread_bind = 0; +module_param(spl_taskq_thread_bind, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); + + +int spl_taskq_thread_dynamic = 0; +module_param(spl_taskq_thread_dynamic, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads"); + +int spl_taskq_thread_priority = 1; +module_param(spl_taskq_thread_priority, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_priority, + "Allow non-default priority for taskq threads"); + +int spl_taskq_thread_sequential = 4; +module_param(spl_taskq_thread_sequential, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_sequential, + "Create new taskq threads after N sequential tasks"); + +/* Global system-wide dynamic task queue available for all consumers */ +taskq_t *system_taskq; +EXPORT_SYMBOL(system_taskq); + +/* Private dedicated taskq for creating new taskq threads on demand. */ +static taskq_t *dynamic_taskq; +static taskq_thread_t *taskq_thread_create(taskq_t *); + +static int +task_km_flags(uint_t flags) +{ + if (flags & TQ_NOSLEEP) + return KM_NOSLEEP; + + if (flags & TQ_PUSHPAGE) + return KM_PUSHPAGE; + + return KM_SLEEP; +} + +/* + * NOTE: Must be called with tq->tq_lock held, returns a list_t which + * is not attached to the free, work, or pending taskq lists. + */ +static taskq_ent_t * +task_alloc(taskq_t *tq, uint_t flags) +{ + taskq_ent_t *t; + int count = 0; + + ASSERT(tq); + ASSERT(spin_is_locked(&tq->tq_lock)); +retry: + /* Acquire taskq_ent_t's from free list if available */ + if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) { + t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL)); + ASSERT(!timer_pending(&t->tqent_timer)); + + list_del_init(&t->tqent_list); + return (t); + } + + /* Free list is empty and memory allocations are prohibited */ + if (flags & TQ_NOALLOC) + return (NULL); + + /* Hit maximum taskq_ent_t pool size */ + if (tq->tq_nalloc >= tq->tq_maxalloc) { + if (flags & TQ_NOSLEEP) + return (NULL); + + /* + * Sleep periodically polling the free list for an available + * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed + * but we cannot block forever waiting for an taskq_ent_t to + * show up in the free list, otherwise a deadlock can happen. + * + * Therefore, we need to allocate a new task even if the number + * of allocated tasks is above tq->tq_maxalloc, but we still + * end up delaying the task allocation by one second, thereby + * throttling the task dispatch rate. + */ + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + schedule_timeout(HZ / 100); + spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + if (count < 100) { + count++; + goto retry; + } + } + + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + t = kmem_alloc(sizeof(taskq_ent_t), task_km_flags(flags)); + spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + + if (t) { + taskq_init_ent(t); + tq->tq_nalloc++; + } + + return (t); +} + +/* + * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t + * to already be removed from the free, work, or pending taskq lists. 
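
The task_alloc() retry path preserved in this .orig copy never blocks indefinitely: it polls the free list with a short sleep and, after 100 attempts, allocates anyway, so a TQ_SLEEP dispatcher is throttled rather than deadlocked. A userspace sketch of that bounded-polling shape (names and stubs are illustrative):

    #include <stddef.h>
    #include <unistd.h>

    /* Stubs standing in for the free-list probe and the blocking allocator. */
    static void *try_get_free_object(void) { return NULL; }
    static void *slow_allocate(void) { return NULL; }

    static void *alloc_throttled(void)
    {
        void *obj;
        int attempt;

        /* Poll the pool, sleeping ~10ms per try (the HZ/100 analogue). */
        for (attempt = 0; attempt < 100; attempt++) {
            if ((obj = try_get_free_object()) != NULL)
                return (obj);
            usleep(10 * 1000);
        }

        /* Retry cap reached: allocate anyway rather than risk deadlock. */
        return (slow_allocate());
    }
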
+ */ +static void +task_free(taskq_t *tq, taskq_ent_t *t) +{ + ASSERT(tq); + ASSERT(t); + ASSERT(spin_is_locked(&tq->tq_lock)); + ASSERT(list_empty(&t->tqent_list)); + ASSERT(!timer_pending(&t->tqent_timer)); + + kmem_free(t, sizeof(taskq_ent_t)); + tq->tq_nalloc--; +} + +/* + * NOTE: Must be called with tq->tq_lock held, either destroys the + * taskq_ent_t if too many exist or moves it to the free list for later use. + */ +static void +task_done(taskq_t *tq, taskq_ent_t *t) +{ + ASSERT(tq); + ASSERT(t); + ASSERT(spin_is_locked(&tq->tq_lock)); + + /* Wake tasks blocked in taskq_wait_id() */ + wake_up_all(&t->tqent_waitq); + + list_del_init(&t->tqent_list); + + if (tq->tq_nalloc <= tq->tq_minalloc) { + t->tqent_id = 0; + t->tqent_func = NULL; + t->tqent_arg = NULL; + t->tqent_flags = 0; + + list_add_tail(&t->tqent_list, &tq->tq_free_list); + } else { + task_free(tq, t); + } +} + +/* + * When a delayed task timer expires remove it from the delay list and + * add it to the priority list in order for immediate processing. + */ +static void +task_expire(unsigned long data) +{ + taskq_ent_t *w, *t = (taskq_ent_t *)data; + taskq_t *tq = t->tqent_taskq; + struct list_head *l; + + spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + + if (t->tqent_flags & TQENT_FLAG_CANCEL) { + ASSERT(list_empty(&t->tqent_list)); + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + return; + } + + /* + * The priority list must be maintained in strict task id order + * from lowest to highest for lowest_id to be easily calculable. + */ + list_del(&t->tqent_list); + list_for_each_prev(l, &tq->tq_prio_list) { + w = list_entry(l, taskq_ent_t, tqent_list); + if (w->tqent_id < t->tqent_id) { + list_add(&t->tqent_list, l); + break; + } + } + if (l == &tq->tq_prio_list) + list_add(&t->tqent_list, &tq->tq_prio_list); + + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + + wake_up(&tq->tq_work_waitq); +} + +/* + * Returns the lowest incomplete taskqid_t. The taskqid_t may + * be queued on the pending list, on the priority list, on the + * delay list, or on the work list currently being handled, but + * it is not 100% complete yet. + */ +static taskqid_t +taskq_lowest_id(taskq_t *tq) +{ + taskqid_t lowest_id = tq->tq_next_id; + taskq_ent_t *t; + taskq_thread_t *tqt; + + ASSERT(tq); + ASSERT(spin_is_locked(&tq->tq_lock)); + + if (!list_empty(&tq->tq_pend_list)) { + t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_prio_list)) { + t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_delay_list)) { + t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_active_list)) { + tqt = list_entry(tq->tq_active_list.next, taskq_thread_t, + tqt_active_list); + ASSERT(tqt->tqt_id != 0); + lowest_id = MIN(lowest_id, tqt->tqt_id); + } + + return (lowest_id); +} + +/* + * Insert a task into a list keeping the list sorted by increasing taskqid. 
+ */ +static void +taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt) +{ + taskq_thread_t *w; + struct list_head *l; + + ASSERT(tq); + ASSERT(tqt); + ASSERT(spin_is_locked(&tq->tq_lock)); + + list_for_each_prev(l, &tq->tq_active_list) { + w = list_entry(l, taskq_thread_t, tqt_active_list); + if (w->tqt_id < tqt->tqt_id) { + list_add(&tqt->tqt_active_list, l); + break; + } + } + if (l == &tq->tq_active_list) + list_add(&tqt->tqt_active_list, &tq->tq_active_list); +} + +/* + * Find and return a task from the given list if it exists. The list + * must be in lowest to highest task id order. + */ +static taskq_ent_t * +taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id) +{ + struct list_head *l; + taskq_ent_t *t; + + ASSERT(spin_is_locked(&tq->tq_lock)); + + list_for_each(l, lh) { + t = list_entry(l, taskq_ent_t, tqent_list); + + if (t->tqent_id == id) + return (t); + + if (t->tqent_id > id) + break; + } + + return (NULL); +} + +/* + * Find an already dispatched task given the task id regardless of what + * state it is in. If a task is still pending or executing it will be + * returned and 'active' set appropriately. If the task has already + * been run then NULL is returned. + */ +static taskq_ent_t * +taskq_find(taskq_t *tq, taskqid_t id, int *active) +{ + taskq_thread_t *tqt; + struct list_head *l; + taskq_ent_t *t; + + ASSERT(spin_is_locked(&tq->tq_lock)); + *active = 0; + + t = taskq_find_list(tq, &tq->tq_delay_list, id); + if (t) + return (t); + + t = taskq_find_list(tq, &tq->tq_prio_list, id); + if (t) + return (t); + + t = taskq_find_list(tq, &tq->tq_pend_list, id); + if (t) + return (t); + + list_for_each(l, &tq->tq_active_list) { + tqt = list_entry(l, taskq_thread_t, tqt_active_list); + if (tqt->tqt_id == id) { + t = tqt->tqt_task; + *active = 1; + return (t); + } + } + + return (NULL); +} + +/* + * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and + * taskq_wait() functions below. + * + * Taskq waiting is accomplished by tracking the lowest outstanding task + * id and the next available task id. As tasks are dispatched they are + * added to the tail of the pending, priority, or delay lists. As worker + * threads become available the tasks are removed from the heads of these + * lists and linked to the worker threads. This ensures the lists are + * kept sorted by lowest to highest task id. + * + * Therefore the lowest outstanding task id can be quickly determined by + * checking the head item from all of these lists. This value is stored + * with the taskq as the lowest id. It only needs to be recalculated when + * either the task with the current lowest id completes or is canceled. + * + * By blocking until the lowest task id exceeds the passed task id the + * taskq_wait_outstanding() function can be easily implemented. Similarly, + * by blocking until the lowest task id matches the next task id taskq_wait() + * can be implemented. + * + * Callers should be aware that when there are multiple worked threads it + * is possible for larger task ids to complete before smaller ones. Also + * when the taskq contains delay tasks with small task ids callers may + * block for a considerable length of time waiting for them to expire and + * execute. 
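
Because of the sorting described above, the lowest outstanding id is just the minimum over the head of each list, defaulting to the next id when everything has drained. A compact sketch (fields illustrative; zero marks an empty list here):

    struct tq_heads {
        /* head task id of each list, or 0 when that list is empty */
        unsigned long pend, prio, delay, active;
        unsigned long next_id;
    };

    /* Lowest incomplete task id; next_id when all work has finished. */
    static unsigned long lowest_id_sketch(const struct tq_heads *q)
    {
        const unsigned long ids[4] = { q->pend, q->prio, q->delay, q->active };
        unsigned long lo = q->next_id;
        int i;

        for (i = 0; i < 4; i++)
            if (ids[i] != 0 && ids[i] < lo)
                lo = ids[i];

        return (lo);
    }

taskq_wait_outstanding(id) then reduces to waiting for id < lowest, and taskq_wait() to waiting for lowest == next_id.
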
+ */ +static int +taskq_wait_id_check(taskq_t *tq, taskqid_t id) +{ + int active = 0; + int rc; + + spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + rc = (taskq_find(tq, id, &active) == NULL); + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + + return (rc); +} + +/* + * The taskq_wait_id() function blocks until the passed task id completes. + * This does not guarantee that all lower task ids have completed. + */ +void +taskq_wait_id(taskq_t *tq, taskqid_t id) +{ + wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id)); +} +EXPORT_SYMBOL(taskq_wait_id); + +static int +taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id) +{ + int rc; + + spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + rc = (id < tq->tq_lowest_id); + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + + return (rc); +} + +/* + * The taskq_wait_outstanding() function will block until all tasks with a + * lower taskqid than the passed 'id' have been completed. Note that all + * task id's are assigned monotonically at dispatch time. Zero may be + * passed for the id to indicate all tasks dispatch up to this point, + * but not after, should be waited for. + */ +void +taskq_wait_outstanding(taskq_t *tq, taskqid_t id) +{ + wait_event(tq->tq_wait_waitq, + taskq_wait_outstanding_check(tq, id ? id : tq->tq_next_id - 1)); +} +EXPORT_SYMBOL(taskq_wait_outstanding); + +static int +taskq_wait_check(taskq_t *tq) +{ + int rc; + + spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + rc = (tq->tq_lowest_id == tq->tq_next_id); + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + + return (rc); +} + +/* + * The taskq_wait() function will block until the taskq is empty. + * This means that if a taskq re-dispatches work to itself taskq_wait() + * callers will block indefinitely. + */ +void +taskq_wait(taskq_t *tq) +{ + wait_event(tq->tq_wait_waitq, taskq_wait_check(tq)); +} +EXPORT_SYMBOL(taskq_wait); + +static int +taskq_member_impl(taskq_t *tq, void *t) +{ + struct list_head *l; + taskq_thread_t *tqt; + int found = 0; + + ASSERT(tq); + ASSERT(t); + ASSERT(spin_is_locked(&tq->tq_lock)); + + list_for_each(l, &tq->tq_thread_list) { + tqt = list_entry(l, taskq_thread_t, tqt_thread_list); + if (tqt->tqt_thread == (struct task_struct *)t) { + found = 1; + break; + } + } + return (found); +} + +int +taskq_member(taskq_t *tq, void *t) +{ + int found; + + spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + found = taskq_member_impl(tq, t); + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + + return (found); +} +EXPORT_SYMBOL(taskq_member); + +/* + * Cancel an already dispatched task given the task id. Still pending tasks + * will be immediately canceled, and if the task is active the function will + * block until it completes. Preallocated tasks which are canceled must be + * freed by the caller. + */ +int +taskq_cancel_id(taskq_t *tq, taskqid_t id) +{ + taskq_ent_t *t; + int active = 0; + int rc = ENOENT; + + ASSERT(tq); + + spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + t = taskq_find(tq, id, &active); + if (t && !active) { + list_del_init(&t->tqent_list); + t->tqent_flags |= TQENT_FLAG_CANCEL; + + /* + * When canceling the lowest outstanding task id we + * must recalculate the new lowest outstanding id. + */ + if (tq->tq_lowest_id == t->tqent_id) { + tq->tq_lowest_id = taskq_lowest_id(tq); + ASSERT3S(tq->tq_lowest_id, >, t->tqent_id); + } + + /* + * The task_expire() function takes the tq->tq_lock so drop + * drop the lock before synchronously cancelling the timer. 
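
The lock dance above exists because task_expire(), the timer callback, takes tq_lock itself; waiting for it synchronously while holding the lock would deadlock. The shape of the pattern, rendered with pthreads and a stubbed cancel (illustrative only):

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Stub standing in for del_timer_sync(): waits out the callback. */
    static void cancel_timer_sync(void) { }

    static void cancel_entry(void)
    {
        pthread_mutex_lock(&lock);
        /* ... unlink the entry from its list ... */

        /*
         * The timer callback also takes 'lock', so waiting for it to
         * finish while holding 'lock' would deadlock: drop, cancel,
         * retake.
         */
        pthread_mutex_unlock(&lock);
        cancel_timer_sync();
        pthread_mutex_lock(&lock);

        /* ... finish the teardown under the lock ... */
        pthread_mutex_unlock(&lock);
    }
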
+ */ + if (timer_pending(&t->tqent_timer)) { + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + del_timer_sync(&t->tqent_timer); + spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + } + + if (!(t->tqent_flags & TQENT_FLAG_PREALLOC)) + task_done(tq, t); + + rc = 0; + } + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + + if (active) { + taskq_wait_id(tq, id); + rc = EBUSY; + } + + return (rc); +} +EXPORT_SYMBOL(taskq_cancel_id); + +static int taskq_thread_spawn(taskq_t *tq); + +taskqid_t +taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) +{ + taskq_ent_t *t; + taskqid_t rc = 0; + + ASSERT(tq); + ASSERT(func); + + spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + + /* Taskq being destroyed and all tasks drained */ + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + /* Do not queue the task unless there is idle thread for it */ + ASSERT(tq->tq_nactive <= tq->tq_nthreads); + if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) + goto out; + + if ((t = task_alloc(tq, flags)) == NULL) + goto out; + + spin_lock(&t->tqent_lock); + + /* Queue to the priority list instead of the pending list */ + if (flags & TQ_FRONT) + list_add_tail(&t->tqent_list, &tq->tq_prio_list); + else + list_add_tail(&t->tqent_list, &tq->tq_pend_list); + + t->tqent_id = rc = tq->tq_next_id; + tq->tq_next_id++; + t->tqent_func = func; + t->tqent_arg = arg; + t->tqent_taskq = tq; + t->tqent_timer.data = 0; + t->tqent_timer.function = NULL; + t->tqent_timer.expires = 0; + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + + spin_unlock(&t->tqent_lock); + + wake_up(&tq->tq_work_waitq); +out: + /* Spawn additional taskq threads if required. */ + if (tq->tq_nactive == tq->tq_nthreads) + (void) taskq_thread_spawn(tq); + + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + return (rc); +} +EXPORT_SYMBOL(taskq_dispatch); + +taskqid_t +taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time) +{ + taskqid_t rc = 0; + taskq_ent_t *t; + + ASSERT(tq); + ASSERT(func); + + spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + + /* Taskq being destroyed and all tasks drained */ + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + if ((t = task_alloc(tq, flags)) == NULL) + goto out; + + spin_lock(&t->tqent_lock); + + /* Queue to the delay list for subsequent execution */ + list_add_tail(&t->tqent_list, &tq->tq_delay_list); + + t->tqent_id = rc = tq->tq_next_id; + tq->tq_next_id++; + t->tqent_func = func; + t->tqent_arg = arg; + t->tqent_taskq = tq; + t->tqent_timer.data = (unsigned long)t; + t->tqent_timer.function = task_expire; + t->tqent_timer.expires = (unsigned long)expire_time; + add_timer(&t->tqent_timer); + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + + spin_unlock(&t->tqent_lock); +out: + /* Spawn additional taskq threads if required. */ + if (tq->tq_nactive == tq->tq_nthreads) + (void) taskq_thread_spawn(tq); + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + return (rc); +} +EXPORT_SYMBOL(taskq_dispatch_delay); + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *t) +{ + ASSERT(tq); + ASSERT(func); + + spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + + /* Taskq being destroyed and all tasks drained */ + if (!(tq->tq_flags & TASKQ_ACTIVE)) { + t->tqent_id = 0; + goto out; + } + + spin_lock(&t->tqent_lock); + + /* + * Mark it as a prealloc'd task. This is important + * to ensure that we don't free it later. 
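
TQENT_FLAG_PREALLOC is an ownership marker: a preallocated entry belongs to the dispatcher, so the completion path must snapshot the flags before running the function (which may re-dispatch the entry) and free only queue-owned entries. A tiny sketch of that ownership test (illustrative):

    #include <stdlib.h>

    #define ENT_FLAG_PREALLOC 0x1

    struct ent {
        unsigned int flags;
        void (*func)(void *);
        void *arg;
    };

    static void ent_run_and_done(struct ent *t)
    {
        /* Snapshot: t may be re-dispatched (and mutated) by t->func. */
        unsigned int flags = t->flags;

        t->func(t->arg);

        /* Prealloc'd entries belong to the caller; never free those. */
        if (!(flags & ENT_FLAG_PREALLOC))
            free(t);
    }
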
+ */
+ t->tqent_flags |= TQENT_FLAG_PREALLOC;
+
+ /* Queue to the priority list instead of the pending list */
+ if (flags & TQ_FRONT)
+ list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+ else
+ list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+ t->tqent_id = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+
+ spin_unlock(&t->tqent_lock);
+
+ wake_up(&tq->tq_work_waitq);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+}
+EXPORT_SYMBOL(taskq_dispatch_ent);
+
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+ return list_empty(&t->tqent_list);
+}
+EXPORT_SYMBOL(taskq_empty_ent);
+
+void
+taskq_init_ent(taskq_ent_t *t)
+{
+ spin_lock_init(&t->tqent_lock);
+ init_waitqueue_head(&t->tqent_waitq);
+ init_timer(&t->tqent_timer);
+ INIT_LIST_HEAD(&t->tqent_list);
+ t->tqent_id = 0;
+ t->tqent_func = NULL;
+ t->tqent_arg = NULL;
+ t->tqent_flags = 0;
+ t->tqent_taskq = NULL;
+}
+EXPORT_SYMBOL(taskq_init_ent);
+
+/*
+ * Return the next pending task; preference is given to tasks on the
+ * priority list which were dispatched with TQ_FRONT.
+ */
+static taskq_ent_t *
+taskq_next_ent(taskq_t *tq)
+{
+ struct list_head *list;
+
+ ASSERT(spin_is_locked(&tq->tq_lock));
+
+ if (!list_empty(&tq->tq_prio_list))
+ list = &tq->tq_prio_list;
+ else if (!list_empty(&tq->tq_pend_list))
+ list = &tq->tq_pend_list;
+ else
+ return (NULL);
+
+ return (list_entry(list->next, taskq_ent_t, tqent_list));
+}
+
+/*
+ * Spawns a new thread for the specified taskq.
+ */
+static void
+taskq_thread_spawn_task(void *arg)
+{
+ taskq_t *tq = (taskq_t *)arg;
+
+ (void) taskq_thread_create(tq);
+
+ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+ tq->tq_nspawn--;
+ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+}
+
+/*
+ * Spawn additional threads for dynamic taskqs (TASKQ_DYNAMIC) when the
+ * current number of threads is insufficient to handle the pending tasks.
+ * These new threads must be created by the dedicated dynamic_taskq to
+ * avoid deadlocks between thread creation and memory reclaim. The
+ * system_taskq, which is also a dynamic taskq, cannot be safely used
+ * for this.
+ */
+static int
+taskq_thread_spawn(taskq_t *tq)
+{
+ int spawning = 0;
+
+ if (!(tq->tq_flags & TASKQ_DYNAMIC))
+ return (0);
+
+ if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) &&
+ (tq->tq_flags & TASKQ_ACTIVE)) {
+ spawning = (++tq->tq_nspawn);
+ taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task,
+ tq, TQ_NOSLEEP);
+ }
+
+ return (spawning);
+}
+
+/*
+ * Threads in a dynamic taskq should only exit once it has been completely
+ * drained and no other threads are actively servicing tasks. This prevents
+ * threads from being created and destroyed more than is required.
+ *
+ * The first thread in the thread list is treated as the primary thread.
+ * There is nothing special about the primary thread, but in order to keep
+ * all the taskq pids from changing we opt to make it long running.
+ */
+static int
+taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
+{
+ ASSERT(spin_is_locked(&tq->tq_lock));
+
+ if (!(tq->tq_flags & TASKQ_DYNAMIC))
+ return (0);
+
+ if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t,
+ tqt_thread_list) == tqt)
+ return (0);
+
+ return
+ ((tq->tq_nspawn == 0) && /* No threads are being spawned */
+ (tq->tq_nactive == 0) && /* No threads are handling tasks */
+ (tq->tq_nthreads > 1) && /* More than 1 thread is running */
+ (!taskq_next_ent(tq)) && /* There are no pending tasks */
+ (spl_taskq_thread_dynamic));/* Dynamic taskqs are allowed */
+}
+
+static int
+taskq_thread(void *args)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ sigset_t blocked;
+ taskq_thread_t *tqt = args;
+ taskq_t *tq;
+ taskq_ent_t *t;
+ int seq_tasks = 0;
+
+ ASSERT(tqt);
+ tq = tqt->tqt_tq;
+ current->flags |= PF_NOFREEZE;
+
+ (void) spl_fstrans_mark();
+
+ sigfillset(&blocked);
+ sigprocmask(SIG_BLOCK, &blocked, NULL);
+ flush_signals(current);
+
+ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+
+ /* Immediately exit if more threads than allowed were created. */
+ if (tq->tq_nthreads >= tq->tq_maxthreads)
+ goto error;
+
+ tq->tq_nthreads++;
+ list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list);
+ wake_up(&tq->tq_wait_waitq);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+
+ if (list_empty(&tq->tq_pend_list) &&
+ list_empty(&tq->tq_prio_list)) {
+
+ if (taskq_thread_should_stop(tq, tqt)) {
+ wake_up_all(&tq->tq_wait_waitq);
+ break;
+ }
+
+ add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
+ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+
+ schedule();
+ seq_tasks = 0;
+
+ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+ remove_wait_queue(&tq->tq_work_waitq, &wait);
+ } else {
+ __set_current_state(TASK_RUNNING);
+ }
+
+ if ((t = taskq_next_ent(tq)) != NULL) {
+ list_del_init(&t->tqent_list);
+
+ /* In order to support recursively dispatching a
+ * preallocated taskq_ent_t, tqent_id must be
+ * stored prior to executing tqent_func. */
+ tqt->tqt_id = t->tqent_id;
+ tqt->tqt_task = t;
+
+ /* We must store a copy of the flags prior to
+ * servicing the task (servicing a prealloc'd task
+ * returns the ownership of the tqent back to
+ * the caller of taskq_dispatch). Thus,
+ * tqent_flags _may_ change within the call. */
+ tqt->tqt_flags = t->tqent_flags;
+
+ taskq_insert_in_order(tq, tqt);
+ tq->tq_nactive++;
+ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+
+ /* Perform the requested task */
+ t->tqent_func(t->tqent_arg);
+
+ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+ tq->tq_nactive--;
+ list_del_init(&tqt->tqt_active_list);
+ tqt->tqt_task = NULL;
+
+ /* For prealloc'd tasks, we don't free anything. */
+ if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC))
+ task_done(tq, t);
+
+ /* When the current lowest outstanding taskqid is
+ * done, calculate the new lowest outstanding id */
+ if (tq->tq_lowest_id == tqt->tqt_id) {
+ tq->tq_lowest_id = taskq_lowest_id(tq);
+ ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id);
+ }
+
+ /* Spawn additional taskq threads if required.
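+ * A new helper is requested only once this thread has serviced
+ * more than spl_taskq_thread_sequential tasks back to back;
+ * seq_tasks is reset whenever the thread sleeps in schedule()
+ * above, so brief bursts of work do not spawn threads.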
*/ + if ((++seq_tasks) > spl_taskq_thread_sequential && + taskq_thread_spawn(tq)) + seq_tasks = 0; + + tqt->tqt_id = 0; + tqt->tqt_flags = 0; + wake_up_all(&tq->tq_wait_waitq); + } else { + if (taskq_thread_should_stop(tq, tqt)) + break; + } + + set_current_state(TASK_INTERRUPTIBLE); + + } + + __set_current_state(TASK_RUNNING); + tq->tq_nthreads--; + list_del_init(&tqt->tqt_thread_list); +error: + kmem_free(tqt, sizeof (taskq_thread_t)); + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + + return (0); +} + +static taskq_thread_t * +taskq_thread_create(taskq_t *tq) +{ + static int last_used_cpu = 0; + taskq_thread_t *tqt; + + tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE); + INIT_LIST_HEAD(&tqt->tqt_thread_list); + INIT_LIST_HEAD(&tqt->tqt_active_list); + tqt->tqt_tq = tq; + tqt->tqt_id = 0; + + tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt, + "%s", tq->tq_name); + if (tqt->tqt_thread == NULL) { + kmem_free(tqt, sizeof (taskq_thread_t)); + return (NULL); + } + + if (spl_taskq_thread_bind) { + last_used_cpu = (last_used_cpu + 1) % num_online_cpus(); + kthread_bind(tqt->tqt_thread, last_used_cpu); + } + + if (spl_taskq_thread_priority) + set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri)); + + wake_up_process(tqt->tqt_thread); + + return (tqt); +} + +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags) +{ + taskq_t *tq; + taskq_thread_t *tqt; + int count = 0, rc = 0, i; + + ASSERT(name != NULL); + ASSERT(minalloc >= 0); + ASSERT(maxalloc <= INT_MAX); + ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */ + + /* Scale the number of threads using nthreads as a percentage */ + if (flags & TASKQ_THREADS_CPU_PCT) { + ASSERT(nthreads <= 100); + ASSERT(nthreads >= 0); + nthreads = MIN(nthreads, 100); + nthreads = MAX(nthreads, 0); + nthreads = MAX((num_online_cpus() * nthreads) / 100, 1); + } + + tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE); + if (tq == NULL) + return (NULL); + + spin_lock_init(&tq->tq_lock); + INIT_LIST_HEAD(&tq->tq_thread_list); + INIT_LIST_HEAD(&tq->tq_active_list); + tq->tq_name = strdup(name); + tq->tq_nactive = 0; + tq->tq_nthreads = 0; + tq->tq_nspawn = 0; + tq->tq_maxthreads = nthreads; + tq->tq_pri = pri; + tq->tq_minalloc = minalloc; + tq->tq_maxalloc = maxalloc; + tq->tq_nalloc = 0; + tq->tq_flags = (flags | TASKQ_ACTIVE); + tq->tq_next_id = 1; + tq->tq_lowest_id = 1; + INIT_LIST_HEAD(&tq->tq_free_list); + INIT_LIST_HEAD(&tq->tq_pend_list); + INIT_LIST_HEAD(&tq->tq_prio_list); + INIT_LIST_HEAD(&tq->tq_delay_list); + init_waitqueue_head(&tq->tq_work_waitq); + init_waitqueue_head(&tq->tq_wait_waitq); + + if (flags & TASKQ_PREPOPULATE) { + spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + + for (i = 0; i < minalloc; i++) + task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW)); + + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + } + + if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) + nthreads = 1; + + for (i = 0; i < nthreads; i++) { + tqt = taskq_thread_create(tq); + if (tqt == NULL) + rc = 1; + else + count++; + } + + /* Wait for all threads to be started before potential destroy */ + wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count); + + if (rc) { + taskq_destroy(tq); + tq = NULL; + } + + return (tq); +} +EXPORT_SYMBOL(taskq_create); + +void +taskq_destroy(taskq_t *tq) +{ + struct task_struct *thread; + taskq_thread_t *tqt; + taskq_ent_t *t; + + ASSERT(tq); + spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); + tq->tq_flags &= ~TASKQ_ACTIVE; + 
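+ /*
+ * The flag is cleared while tq_lock is held, so the dispatch
+ * paths, which take tq_lock before testing TASKQ_ACTIVE, can
+ * no longer queue new work once the lock is dropped.
+ */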
+ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+
+ /*
+ * When TASKQ_ACTIVE is clear, new tasks may not be added, nor may
+ * new worker threads be spawned for a dynamic taskq.
+ */
+ if (dynamic_taskq != NULL)
+ taskq_wait_outstanding(dynamic_taskq, 0);
+
+ taskq_wait(tq);
+
+ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+
+ /*
+ * Signal each thread to exit and block until it does. Each thread
+ * is responsible for removing itself from the list and freeing its
+ * taskq_thread_t. This allows idle threads to opt to remove
+ * themselves from the taskq. They can be recreated as needed.
+ */
+ while (!list_empty(&tq->tq_thread_list)) {
+ tqt = list_entry(tq->tq_thread_list.next,
+ taskq_thread_t, tqt_thread_list);
+ thread = tqt->tqt_thread;
+ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+
+ kthread_stop(thread);
+
+ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+ }
+
+ while (!list_empty(&tq->tq_free_list)) {
+ t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ list_del_init(&t->tqent_list);
+ task_free(tq, t);
+ }
+
+ ASSERT0(tq->tq_nthreads);
+ ASSERT0(tq->tq_nalloc);
+ ASSERT0(tq->tq_nspawn);
+ ASSERT(list_empty(&tq->tq_thread_list));
+ ASSERT(list_empty(&tq->tq_active_list));
+ ASSERT(list_empty(&tq->tq_free_list));
+ ASSERT(list_empty(&tq->tq_pend_list));
+ ASSERT(list_empty(&tq->tq_prio_list));
+ ASSERT(list_empty(&tq->tq_delay_list));
+
+ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+
+ strfree(tq->tq_name);
+ kmem_free(tq, sizeof (taskq_t));
+}
+EXPORT_SYMBOL(taskq_destroy);
+
+int
+spl_taskq_init(void)
+{
+ system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+ if (system_taskq == NULL)
+ return (1);
+
+ dynamic_taskq = taskq_create("spl_dynamic_taskq", 1,
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
+ if (dynamic_taskq == NULL) {
+ taskq_destroy(system_taskq);
+ return (1);
+ }
+
+ return (0);
+}
+
+void
+spl_taskq_fini(void)
+{
+ taskq_destroy(dynamic_taskq);
+ dynamic_taskq = NULL;
+
+ taskq_destroy(system_taskq);
+ system_taskq = NULL;
+}
diff -Naur spl-0.6.5.7/module/spl/spl-tsd.c spl-0.6.5.7.new/module/spl/spl-tsd.c
--- spl-0.6.5.7/module/spl/spl-tsd.c 2016-05-13 04:46:56.000000000 +0200
+++ spl-0.6.5.7.new/module/spl/spl-tsd.c 2016-08-01 16:43:31.475788506 +0200
@@ -528,6 +528,33 @@
 EXPORT_SYMBOL(tsd_get);
 
 /*
+ * tsd_get_by_thread - get thread specific data for specified thread
+ * @key: lookup key
+ * @thread: thread to lookup
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(). This
+ * implementation is designed to be fast and scalable: it does not
+ * lock the entire table, only a single hash bin.
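+ *
+ * An illustrative call, with 'my_key' standing in for a key made by
+ * tsd_create() and 'other' for the thread of interest:
+ *
+ *   void *val = tsd_get_by_thread(my_key, other);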
+ */
+void *
+tsd_get_by_thread(uint_t key, kthread_t *thread)
+{
+ tsd_hash_entry_t *entry;
+
+ ASSERT3P(tsd_hash_table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (NULL);
+
+ entry = tsd_hash_search(tsd_hash_table, key, thread->pid);
+ if (entry == NULL)
+ return (NULL);
+
+ return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get_by_thread);
+
+/*
 * tsd_create - create thread specific data key
 * @keyp: lookup key address
 * @dtor: destructor called during tsd_destroy() or tsd_exit()
diff -Naur spl-0.6.5.7/module/spl/spl-vnode.c spl-0.6.5.7.new/module/spl/spl-vnode.c
--- spl-0.6.5.7/module/spl/spl-vnode.c 2016-05-13 04:46:56.000000000 +0200
+++ spl-0.6.5.7.new/module/spl/spl-vnode.c 2016-08-01 16:43:34.278796336 +0200
@@ -222,7 +222,6 @@
 ASSERT(vp->v_file);
 ASSERT(seg == UIO_SYSSPACE);
 ASSERT((ioflag & ~FAPPEND) == 0);
- ASSERT(x2 == RLIM64_INFINITY);
 
 fp = vp->v_file;
 
@@ -353,7 +352,8 @@
 if (rc)
 return (ERR_PTR(rc));
 
- spl_inode_lock(parent.dentry->d_inode);
+ /* use I_MUTEX_PARENT because vfs_unlink needs it */
+ spl_inode_lock_nested(parent.dentry->d_inode, I_MUTEX_PARENT);
 
 dentry = lookup_one_len(basename, parent.dentry, len);
 if (IS_ERR(dentry)) {
@@ -572,6 +572,9 @@
 offset_t offset, void *x6, void *x7)
 {
 int error = EOPNOTSUPP;
+#ifdef FALLOC_FL_PUNCH_HOLE
+ int fstrans;
+#endif
 
 if (cmd != F_FREESP || bfp->l_whence != 0)
 return (EOPNOTSUPP);
@@ -582,12 +585,24 @@
 
 #ifdef FALLOC_FL_PUNCH_HOLE
 /*
+ * May enter XFS, which generates a warning when PF_FSTRANS is set.
+ * To avoid this the flag is cleared across the fallocate() call
+ * below and then reset.
+ */
+ fstrans = spl_fstrans_check();
+ if (fstrans)
+ current->flags &= ~(PF_FSTRANS);
+
+ /*
 * When supported by the underlying file system preferentially
 * use the fallocate() callback to preallocate the space.
 */
 error = -spl_filp_fallocate(vp->v_file,
 FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
 bfp->l_start, bfp->l_len);
+
+ if (fstrans)
+ current->flags |= PF_FSTRANS;
+
 if (error == 0)
 return (0);
 #endif
@@ -656,6 +671,19 @@
 
 fp = file_find(fd, current);
 if (fp) {
+ lfp = fget(fd);
+ fput(fp->f_file);
+ /*
+ * areleasef() can cause us to see a stale reference when
+ * userspace has reused a file descriptor before areleasef()
+ * has run. fput() the stale reference and replace it. We
+ * retain the original reference count such that the concurrent
+ * areleasef() will decrement its reference and terminate.
+ */
+ if (lfp != fp->f_file) {
+ fp->f_file = lfp;
+ fp->f_vnode->v_file = lfp;
+ }
 atomic_inc(&fp->f_ref);
 spin_unlock(&vn_file_lock);
 return (fp);
diff -Naur spl-0.6.5.7/module/splat/splat-kmem.c spl-0.6.5.7.new/module/splat/splat-kmem.c
--- spl-0.6.5.7/module/splat/splat-kmem.c 2016-05-13 04:46:57.000000000 +0200
+++ spl-0.6.5.7.new/module/splat/splat-kmem.c 2016-08-01 16:43:22.175762529 +0200
@@ -590,6 +590,9 @@
 kmem_cache_data_t **kcd = NULL;
 int i, rc = 0, objs = 0;
 
+ /* Limit size for low memory machines (1/128 of memory) */
+ size = MIN(size, (physmem * PAGE_SIZE) >> 7);
+
 splat_vprint(file, name, "Testing size=%d, align=%d, flags=0x%04x\n",
 size, align, flags);
 
@@ -619,7 +622,7 @@
 * it to a single slab for the purposes of this test.
*/ #ifdef _LP64 - objs = SPL_KMEM_CACHE_OBJ_PER_SLAB * 4; + objs = kcp->kcp_cache->skc_slab_objs * 4; #else objs = 1; #endif diff -Naur spl-0.6.5.7/module/splat/splat-rwlock.c spl-0.6.5.7.new/module/splat/splat-rwlock.c --- spl-0.6.5.7/module/splat/splat-rwlock.c 2016-05-13 04:46:57.000000000 +0200 +++ spl-0.6.5.7.new/module/splat/splat-rwlock.c 2016-08-01 16:43:34.280796341 +0200 @@ -55,8 +55,12 @@ #define SPLAT_RWLOCK_TEST5_DESC "Write downgrade" #define SPLAT_RWLOCK_TEST6_ID 0x0706 -#define SPLAT_RWLOCK_TEST6_NAME "rw_tryupgrade" -#define SPLAT_RWLOCK_TEST6_DESC "Read upgrade" +#define SPLAT_RWLOCK_TEST6_NAME "rw_tryupgrade-1" +#define SPLAT_RWLOCK_TEST6_DESC "rwsem->count value" + +#define SPLAT_RWLOCK_TEST7_ID 0x0707 +#define SPLAT_RWLOCK_TEST7_NAME "rw_tryupgrade-2" +#define SPLAT_RWLOCK_TEST7_DESC "Read upgrade" #define SPLAT_RWLOCK_TEST_MAGIC 0x115599DDUL #define SPLAT_RWLOCK_TEST_NAME "rwlock_test" @@ -580,19 +584,65 @@ splat_init_rw_priv(rwp, file); rw_enter(&rwp->rw_rwlock, RW_READER); - if (!RW_READ_HELD(&rwp->rw_rwlock)) { + if (RWSEM_COUNT(SEM(&rwp->rw_rwlock)) != + SPL_RWSEM_SINGLE_READER_VALUE) { + splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, + "We assumed single reader rwsem->count " + "should be %ld, but is %ld\n", + SPL_RWSEM_SINGLE_READER_VALUE, + RWSEM_COUNT(SEM(&rwp->rw_rwlock))); + rc = -ENOLCK; + goto out; + } + rw_exit(&rwp->rw_rwlock); + + rw_enter(&rwp->rw_rwlock, RW_WRITER); + if (RWSEM_COUNT(SEM(&rwp->rw_rwlock)) != + SPL_RWSEM_SINGLE_WRITER_VALUE) { splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, + "We assumed single writer rwsem->count " + "should be %ld, but is %ld\n", + SPL_RWSEM_SINGLE_WRITER_VALUE, + RWSEM_COUNT(SEM(&rwp->rw_rwlock))); + rc = -ENOLCK; + goto out; + } + rc = 0; + splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "%s", + "rwsem->count same as we assumed\n"); +out: + rw_exit(&rwp->rw_rwlock); + rw_destroy(&rwp->rw_rwlock); + kfree(rwp); + + return rc; +} + +static int +splat_rwlock_test7(struct file *file, void *arg) +{ + rw_priv_t *rwp; + int rc; + + rwp = (rw_priv_t *)kmalloc(sizeof(*rwp), GFP_KERNEL); + if (rwp == NULL) + return -ENOMEM; + + splat_init_rw_priv(rwp, file); + + rw_enter(&rwp->rw_rwlock, RW_READER); + if (!RW_READ_HELD(&rwp->rw_rwlock)) { + splat_vprint(file, SPLAT_RWLOCK_TEST7_NAME, "rwlock should be read lock: %d\n", RW_READ_HELD(&rwp->rw_rwlock)); rc = -ENOLCK; goto out; } -#if defined(CONFIG_RWSEM_GENERIC_SPINLOCK) /* With one reader upgrade should never fail. 
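 * rw_tryupgrade() returns non-zero on success, so a zero return
 * below means the upgrade from reader to writer was refused.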
*/ rc = rw_tryupgrade(&rwp->rw_rwlock); if (!rc) { - splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, + splat_vprint(file, SPLAT_RWLOCK_TEST7_NAME, "rwlock failed upgrade from reader: %d\n", RW_READ_HELD(&rwp->rw_rwlock)); rc = -ENOLCK; @@ -600,7 +650,7 @@ } if (RW_READ_HELD(&rwp->rw_rwlock) || !RW_WRITE_HELD(&rwp->rw_rwlock)) { - splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "rwlock should " + splat_vprint(file, SPLAT_RWLOCK_TEST7_NAME, "rwlock should " "have 0 (not %d) reader and 1 (not %d) writer\n", RW_READ_HELD(&rwp->rw_rwlock), RW_WRITE_HELD(&rwp->rw_rwlock)); @@ -608,13 +658,8 @@ } rc = 0; - splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "%s", + splat_vprint(file, SPLAT_RWLOCK_TEST7_NAME, "%s", "rwlock properly upgraded\n"); -#else - rc = 0; - splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "%s", - "rw_tryupgrade() is disabled for this arch\n"); -#endif out: rw_exit(&rwp->rw_rwlock); rw_destroy(&rwp->rw_rwlock); @@ -652,6 +697,8 @@ SPLAT_RWLOCK_TEST5_ID, splat_rwlock_test5); SPLAT_TEST_INIT(sub, SPLAT_RWLOCK_TEST6_NAME, SPLAT_RWLOCK_TEST6_DESC, SPLAT_RWLOCK_TEST6_ID, splat_rwlock_test6); + SPLAT_TEST_INIT(sub, SPLAT_RWLOCK_TEST7_NAME, SPLAT_RWLOCK_TEST7_DESC, + SPLAT_RWLOCK_TEST7_ID, splat_rwlock_test7); return sub; } @@ -660,6 +707,7 @@ splat_rwlock_fini(splat_subsystem_t *sub) { ASSERT(sub); + SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST7_ID); SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST6_ID); SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST5_ID); SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST4_ID); diff -Naur spl-0.6.5.7/module/splat/splat-taskq.c spl-0.6.5.7.new/module/splat/splat-taskq.c --- spl-0.6.5.7/module/splat/splat-taskq.c 2016-05-13 04:46:57.000000000 +0200 +++ spl-0.6.5.7.new/module/splat/splat-taskq.c 2016-08-01 16:43:34.280796341 +0200 @@ -1040,11 +1040,12 @@ error = (tq_arg->depth == SPLAT_TASKQ_DEPTH_MAX ? 0 : -EINVAL); + splat_vprint(file, SPLAT_TASKQ_TEST7_NAME, + "Taskq '%s' destroying\n", tq_arg->name); + kmem_free(tqe, sizeof (taskq_ent_t)); kmem_free(tq_arg, sizeof (splat_taskq_arg_t)); - splat_vprint(file, SPLAT_TASKQ_TEST7_NAME, - "Taskq '%s' destroying\n", tq_arg->name); taskq_destroy(tq); return (error); diff -Naur spl-0.6.5.7/rpm/generic/spl-dkms.spec.in.orig spl-0.6.5.7.new/rpm/generic/spl-dkms.spec.in.orig --- spl-0.6.5.7/rpm/generic/spl-dkms.spec.in.orig 1970-01-01 01:00:00.000000000 +0100 +++ spl-0.6.5.7.new/rpm/generic/spl-dkms.spec.in.orig 2016-08-01 16:43:14.783741881 +0200 @@ -0,0 +1,71 @@ +%{?!packager: %define packager Brian Behlendorf } + +%define module @PACKAGE@ +%define mkconf scripts/dkms.mkconf + +Name: %{module}-dkms + +Version: @VERSION@ +Release: @RELEASE@%{?dist} +Summary: Kernel module(s) (dkms) + +Group: System Environment/Kernel +License: GPLv2+ +URL: http://zfsonlinux.org/ +Source0: %{module}-%{version}.tar.gz +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) +BuildArch: noarch + +Requires: dkms >= 2.2.0.2 +Requires: gcc, make, perl +Requires: kernel-devel +Provides: %{module}-kmod = %{version} + +%description +This package contains the dkms kernel modules required to emulate +several interfaces provided by the Solaris kernel. 
+ +%prep +%setup -q -n %{module}-%{version} + +%build +%{mkconf} -n %{module} -v %{version} -f dkms.conf + +%install +if [ "$RPM_BUILD_ROOT" != "/" ]; then + rm -rf $RPM_BUILD_ROOT +fi +mkdir -p $RPM_BUILD_ROOT/usr/src/ +cp -rf ${RPM_BUILD_DIR}/%{module}-%{version} $RPM_BUILD_ROOT/usr/src/ + +%clean +if [ "$RPM_BUILD_ROOT" != "/" ]; then + rm -rf $RPM_BUILD_ROOT +fi + +%files +%defattr(-,root,root) +/usr/src/%{module}-%{version} + +%post +for POSTINST in /usr/lib/dkms/common.postinst; do + if [ -f $POSTINST ]; then + $POSTINST %{module} %{version} + exit $? + fi + echo "WARNING: $POSTINST does not exist." +done +echo -e "ERROR: DKMS version is too old and %{module} was not" +echo -e "built with legacy DKMS support." +echo -e "You must either rebuild %{module} with legacy postinst" +echo -e "support or upgrade DKMS to a more current version." +exit 1 + +%preun +echo -e "Uninstall of %{module} module (version %{version}) beginning:" +dkms remove -m %{module} -v %{version} --all --rpm_safe_upgrade +exit 0 + +%changelog +* %(date "+%a %b %d %Y") %packager %{version}-%{release} +- Automatic build by DKMS diff -Naur spl-0.6.5.7/rpm/generic/spl.spec.in spl-0.6.5.7.new/rpm/generic/spl.spec.in --- spl-0.6.5.7/rpm/generic/spl.spec.in 2016-05-13 04:06:38.000000000 +0200 +++ spl-0.6.5.7.new/rpm/generic/spl.spec.in 2016-08-01 16:43:23.436766051 +0200 @@ -28,6 +28,7 @@ %files %doc AUTHORS COPYING DISCLAIMER +%{_bindir}/* %{_sbindir}/* %{_mandir}/man1/* %{_mandir}/man5/* diff -Naur spl-0.6.5.7/rpm/generic/spl.spec.in.orig spl-0.6.5.7.new/rpm/generic/spl.spec.in.orig --- spl-0.6.5.7/rpm/generic/spl.spec.in.orig 1970-01-01 01:00:00.000000000 +0100 +++ spl-0.6.5.7.new/rpm/generic/spl.spec.in.orig 2016-05-13 04:06:38.000000000 +0200 @@ -0,0 +1,73 @@ +Name: @PACKAGE@ +Version: @VERSION@ +Release: @RELEASE@%{?dist} +Summary: Commands to control the kernel modules + +Group: System Environment/Kernel +License: GPLv2+ +URL: http://zfsonlinux.org/ +Source0: %{name}-%{version}.tar.gz +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) +Requires: %{name}-kmod = %{version} +Provides: %{name}-kmod-common = %{version} + +%description +This package contains the commands to verify the SPL +kernel modules are functioning properly. 
+ +%prep +%setup -q + +%build +%configure --with-config=user +make %{?_smp_mflags} + +%install +%{__rm} -rf $RPM_BUILD_ROOT +make install DESTDIR=%{?buildroot} + +%files +%doc AUTHORS COPYING DISCLAIMER +%{_sbindir}/* +%{_mandir}/man1/* +%{_mandir}/man5/* + +%changelog +* Thu May 12 2016 Ned Bass - 0.6.5.7-1 +- Fix PPC build failure zfsonlinux/spl#516 +* Tue Mar 22 2016 Ned Bass - 0.6.5.6-1 +- Remove artificial architecture restrictions in packaging +- Add support for s390[x] zfsonlinux/spl#537 +* Wed Mar 9 2016 Ned Bass - 0.6.5.5-1 +- Linux 4.5 compatibility zfsonlinux/spl#524 +- Create working debuginfo packages on Red Hat zfsonlinux/zfs#4224 +- Allow copy-builtin to run multiple times zfsonlinux/spl#526 +- Use safer flags for in-kernel memory allocations zfsonlinux/spl#523 +- Fix potential deadlock in cv_wait() zfsonlinux/zfs#4106 +- Fix livelock in shrinker zfsonlinux/zfs#3936 +* Fri Jan 8 2016 Ned Bass - 0.6.5.4-1 +- Build fixes on SPARC and some kernels +- Fix taskq dynamic spawning deadlock +- Fix builtin kernel builds +- Fix crash due to overflow in P2ROUNDUP macro +- Fix deadlock during direct memory reclaim +* Tue Oct 13 2015 Ned Bass - 0.6.5.3-1 +- Fix CPU hotplug zfsonlinux/spl#482 +- Disable dynamic taskqs by default to avoid deadlock zfsonlinux/spl#484 +* Tue Sep 29 2015 Ned Bass - 0.6.5.2-1 +- Released 0.6.5.2-1 +- Fix PAX Patch/Grsec SLAB_USERCOPY panic zfsonlinux/zfs#3796 +- Always remove during dkms uninstall/update zfsonlinux/spl#476 +* Thu Sep 19 2015 Ned Bass - 0.6.5.1-1 +- Released 0.6.5.1-1, no changes from spl-0.6.5 +* Thu Sep 10 2015 Brian Behlendorf - 0.6.5-1 +- Released 0.6.5-1, detailed release notes are available at: +- https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.6.5 +* Wed Apr 8 2015 Brian Behlendorf - 0.6.4-1 +- Released 0.6.4-1 +* Thu Jun 12 2014 Brian Behlendorf - 0.6.3-1 +- Released 0.6.3-1 +* Wed Aug 21 2013 Brian Behlendorf - 0.6.2-1 +- Released 0.6.2-1 +* Fri Mar 22 2013 Brian Behlendorf - 0.6.1-1 +- First official stable release.