diff --git a/drivers/Kconfig b/drivers/Kconfig index c6c2eb1..d6fbcbf 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -39,5 +39,6 @@ source "drivers/crypto/Kconfig" source "drivers/memory/Kconfig" source "drivers/soc/imx/Kconfig" +source "drivers/nvme/Kconfig" endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 752fd66..65fd488 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -39,3 +39,4 @@ obj-$(CONFIG_AIODEV) += aiodev/ obj-y += memory/ obj-y += soc/imx/ +obj-y += nvme/ diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig new file mode 100644 index 0000000..27ac965 --- /dev/null +++ b/drivers/nvme/Kconfig @@ -0,0 +1,5 @@ +menu "NVME Support" + +source "drivers/nvme/host/Kconfig" + +endmenu diff --git a/drivers/nvme/Makefile b/drivers/nvme/Makefile new file mode 100644 index 0000000..6d7d51c --- /dev/null +++ b/drivers/nvme/Makefile @@ -0,0 +1 @@ +obj-y += host/ diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig new file mode 100644 index 0000000..8888c89 --- /dev/null +++ b/drivers/nvme/host/Kconfig @@ -0,0 +1,11 @@ +config NVME_CORE + bool + +config BLK_DEV_NVME + bool "NVM Express block device" + depends on PCI && BLOCK + select NVME_CORE + ---help--- + The NVM Express driver is for solid state drives directly + connected to the PCI or PCI Express bus. If you know you + don't have one of these, it is safe to answer N. diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile new file mode 100644 index 0000000..9afbc0d --- /dev/null +++ b/drivers/nvme/host/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y += -I$(src) + +obj-$(CONFIG_NVME_CORE) += nvme-core.o +obj-$(CONFIG_BLK_DEV_NVME) += nvme.o + +nvme-core-y := core.o +nvme-y += pci.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c new file mode 100644 index 0000000..e098470 --- /dev/null +++ b/drivers/nvme/host/core.c @@ -0,0 +1,614 @@ +#include + +#include "nvme.h" + +int __nvme_submit_sync_cmd(struct nvme_ctrl *ctrl, + struct nvme_command *cmd, + union nvme_result *result, + void *buffer, unsigned bufflen, + unsigned timeout, int qid) +{ + return ctrl->ops->submit_sync_cmd(ctrl, cmd, result, buffer, bufflen, + timeout, qid); +} +EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd); + +int nvme_submit_sync_cmd(struct nvme_ctrl *ctrl, + struct nvme_command *cmd, + void *buffer, unsigned bufflen) +{ + return __nvme_submit_sync_cmd(ctrl, cmd, NULL, buffer, bufflen, 0, + NVME_QID_ADMIN); +} +EXPORT_SYMBOL_GPL(nvme_sec_submit); + +static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_CTRL; + + *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev, &c, *id, + sizeof(struct nvme_id_ctrl)); + if (error) + kfree(*id); + + return error; +} + +static int +nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, + void *buffer, size_t buflen, u32 *result) +{ + struct nvme_command c; + union nvme_result res; + int ret; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + ret = __nvme_submit_sync_cmd(dev, &c, &res, buffer, buflen, 0, + NVME_QID_ADMIN); + if (ret >= 0 && result) + *result = le32_to_cpu(res.u32); + return ret; +} + +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) +{ + u32 q_count = (*count - 1) | ((*count - 1) << 16); + u32 result; + int status, nr_io_queues; + + status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, + &result); + if (status < 0) + return status; + + /* + * Degraded controllers might return an error when setting the queue + * count. We still want to be able to bring them online and offer + * access to the admin queue, as that might be only way to fix them up. + */ + if (status > 0) { + dev_err(ctrl->dev, "Could not set queue count (%d)\n", status); + *count = 0; + } else { + nr_io_queues = min(result & 0xffff, result >> 16) + 1; + *count = min(*count, nr_io_queues); + } + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_set_queue_count); + +static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) +{ + uint64_t start = get_time_ns(); + unsigned long timeout = + ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2); + u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; + int ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if (csts == ~0) + return -ENODEV; + if ((csts & NVME_CSTS_RDY) == bit) + break; + + mdelay(100); + + if (is_timeout(start, timeout)) { + dev_err(ctrl->dev, + "Device not ready; aborting %s\n", enabled ? + "initialisation" : "reset"); + return -ENODEV; + } + } + + return ret; +} + +static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) +{ + struct nvme_command c = { }; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST; + c.identify.nsid = cpu_to_le32(nsid); + return nvme_submit_sync_cmd(dev, &c, ns_list, NVME_IDENTIFY_DATA_SIZE); +} + +static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, + unsigned nsid) +{ + struct nvme_id_ns *id; + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.cns = NVME_ID_CNS_NS; + + id = kmalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return NULL; + + error = nvme_submit_sync_cmd(ctrl, &c, id, sizeof(*id)); + if (error) { + dev_warn(ctrl->dev, "Identify namespace failed\n"); + kfree(id); + return NULL; + } + + return id; +} + +static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, + unsigned nsid, struct nvme_id_ns *id) +{ + static int instance = 1; + struct nvme_ns_head *head; + int ret = -ENOMEM; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto out; + + head->instance = instance++; + head->ns_id = nsid; + + return head; +out: + return ERR_PTR(ret); +} + +static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, + struct nvme_id_ns *id) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + const bool is_shared = id->nmic & (1 << 0); + struct nvme_ns_head *head = NULL; + + if (is_shared) { + dev_info(ctrl->dev, "Skipping shared namespace %u\n", nsid); + return -ENOTSUPP; + } + + head = nvme_alloc_ns_head(ctrl, nsid, id); + if (IS_ERR(head)) + return PTR_ERR(head); + + ns->head = head; + + return 0; +} + +#define DISK_NAME_LEN 32 + +static void nvme_update_disk_info(struct block_device *blk, struct nvme_ns *ns, + struct nvme_id_ns *id) +{ + blk->blockbits = ns->lba_shift; + blk->num_blocks = le64_to_cpup(&id->nsze); + + ns->readonly = id->nsattr & (1 << 0); +} + +static void __nvme_revalidate_disk(struct block_device *blk, + struct nvme_id_ns *id) +{ + struct nvme_ns *ns = to_nvme_ns(blk); + + /* + * If identify namespace failed, use default 512 byte block size so + * block layer can use before failing read/write for 0 capacity. + */ + ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; + if (ns->lba_shift == 0) + ns->lba_shift = 9; + + nvme_update_disk_info(blk, ns, id); +} + +static void nvme_setup_rw(struct nvme_ns *ns, struct nvme_command *cmnd, + int block, int num_block) +{ + cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, block)); + cmnd->rw.length = cpu_to_le16(num_block - 1); + cmnd->rw.control = 0; + cmnd->rw.dsmgmt = 0; +} + +static void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) +{ + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); +} + +static int nvme_submit_sync_rw(struct nvme_ns *ns, struct nvme_command *cmnd, + void *buffer, int block, int num_blocks) +{ + /* + * ns->ctrl->max_hw_sectors is in units of 512 bytes, so we + * need to make sure we adjust it to discovered lba_shift + */ + const u32 max_hw_sectors = + ns->ctrl->max_hw_sectors >> (ns->lba_shift - 9); + int ret; + + if (num_blocks > max_hw_sectors) { + while (num_blocks) { + const int chunk = min_t(int, num_blocks, + max_hw_sectors); + + ret = nvme_submit_sync_rw(ns, cmnd, buffer, block, + chunk); + if (ret) + break; + + num_blocks -= chunk; + buffer += chunk; + block += chunk; + } + + return ret; + } + + nvme_setup_rw(ns, cmnd, block, num_blocks); + + ret = __nvme_submit_sync_cmd(ns->ctrl, cmnd, NULL, buffer, + num_blocks << ns->lba_shift, + 0, NVME_QID_IO); + + if (ret) { + dev_err(ns->ctrl->dev, + "I/O failed: block: %d, num blocks: %d, status code type: %xh, status code %02xh\n", + block, num_blocks, (ret >> 8) & 0xf, + ret & 0xff); + return -EIO; + } + + return 0; +} + + +static int nvme_block_device_read(struct block_device *blk, void *buffer, + int block, int num_blocks) +{ + struct nvme_ns *ns = to_nvme_ns(blk); + struct nvme_command cmnd = { }; + + cmnd.rw.opcode = nvme_cmd_read; + + return nvme_submit_sync_rw(ns, &cmnd, buffer, block, num_blocks); +} + +static int __maybe_unused +nvme_block_device_write(struct block_device *blk, const void *buffer, + int block, int num_blocks) +{ + struct nvme_ns *ns = to_nvme_ns(blk); + struct nvme_command cmnd = { }; + + if (ns->readonly) + return -EINVAL; + + cmnd.rw.opcode = nvme_cmd_write; + + return nvme_submit_sync_rw(ns, &cmnd, (void *)buffer, block, + num_blocks); +} + +static int __maybe_unused nvme_block_device_flush(struct block_device *blk) +{ + struct nvme_ns *ns = to_nvme_ns(blk); + struct nvme_command cmnd = { }; + + nvme_setup_flush(ns, &cmnd); + + return __nvme_submit_sync_cmd(ns->ctrl, &cmnd, NULL, NULL, + 0, 0, NVME_QID_IO); +} + +static struct block_device_ops nvme_block_device_ops = { + .read = nvme_block_device_read, +#ifdef CONFIG_BLOCK_WRITE + .write = nvme_block_device_write, + .flush = nvme_block_device_flush, +#endif +}; + +static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + struct nvme_id_ns *id; + char disk_name[DISK_NAME_LEN]; + int ret, flags; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return; + + ns->ctrl = ctrl; + ns->lba_shift = 9; /* set to a default value for 512 until + * disk is validated */ + + id = nvme_identify_ns(ctrl, nsid); + if (!id) + goto out_free_ns; + + if (id->ncap == 0) + goto out_free_id; + + if (nvme_init_ns_head(ns, nsid, id)) + goto out_free_id; + + nvme_set_disk_name(disk_name, ns, ctrl, &flags); + + ns->blk.dev = ctrl->dev; + ns->blk.ops = &nvme_block_device_ops; + ns->blk.cdev.name = strdup(disk_name); + + __nvme_revalidate_disk(&ns->blk, id); + kfree(id); + + ret = blockdevice_register(&ns->blk); + if (ret) { + dev_err(ctrl->dev, "Cannot register block device (%d)\n", ret); + goto out_free_id; + } + + return; +out_free_id: + kfree(id); +out_free_ns: + kfree(ns); +} + +static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) +{ + __le32 *ns_list; + unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024); + int ret = 0; + + ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); + if (!ns_list) + return -ENOMEM; + + for (i = 0; i < num_lists; i++) { + ret = nvme_identify_ns_list(ctrl, prev, ns_list); + if (ret) + goto out; + + for (j = 0; j < min(nn, 1024U); j++) { + nsid = le32_to_cpu(ns_list[j]); + if (!nsid) + goto out; + + nvme_alloc_ns(ctrl, nsid); + } + nn -= j; + } + out: + kfree(ns_list); + return ret; +} + +static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) +{ + unsigned i; + + for (i = 1; i <= nn; i++) + nvme_alloc_ns(ctrl, i); +} + +static void nvme_scan_work(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + unsigned nn; + + if (nvme_identify_ctrl(ctrl, &id)) + return; + + nn = le32_to_cpu(id->nn); + if (ctrl->vs >= NVME_VS(1, 1, 0)) { + if (!nvme_scan_ns_list(ctrl, nn)) + goto out_free_id; + } + nvme_scan_ns_sequential(ctrl, nn); +out_free_id: + kfree(id); +} + +void nvme_start_ctrl(struct nvme_ctrl *ctrl) +{ + if (ctrl->queue_count > 1) + nvme_scan_work(ctrl); +} +EXPORT_SYMBOL_GPL(nvme_start_ctrl); + +/* + * If the device has been passed off to us in an enabled state, just clear + * the enabled bit. The spec says we should set the 'shutdown notification + * bits', but doing so may cause the device to complete commands to the + * admin queue ... and we don't know what memory that might be pointing at! + */ +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +{ + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config &= ~NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + + return nvme_wait_ready(ctrl, cap, false); +} +EXPORT_SYMBOL_GPL(nvme_disable_ctrl); + +int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +{ + /* + * Default to a 4K page size, with the intention to update this + * path in the future to accomodate architectures with differing + * kernel and IO page sizes. + */ + unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; + int ret; + + if (page_shift < dev_page_min) { + dev_err(ctrl->dev, + "Minimum device page size %u too large for host (%u)\n", + 1 << dev_page_min, 1 << page_shift); + return -ENODEV; + } + + ctrl->page_size = 1 << page_shift; + + ctrl->ctrl_config = NVME_CC_CSS_NVM; + ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; + ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; + ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; + ctrl->ctrl_config |= NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + return nvme_wait_ready(ctrl, cap, true); +} +EXPORT_SYMBOL_GPL(nvme_enable_ctrl); + +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) +{ + uint64_t start = get_time_ns(); + unsigned long timeout = SHUTDOWN_TIMEOUT; + u32 csts; + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) + break; + + mdelay(100); + + if (is_timeout(start, timeout)) { + dev_err(ctrl->dev, + "Device shutdown incomplete; abort shutdown\n"); + return -ENODEV; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); + +#define NVME_ID_MAX_LEN 41 + +static void nvme_print(struct nvme_ctrl *ctrl, const char *prefix, + const char *_string, size_t _length) +{ + char string[NVME_ID_MAX_LEN]; + const size_t length = min(_length, sizeof(string) - 1); + + memcpy(string, _string, length); + string[length - 1] = '\0'; + + dev_info(ctrl->dev, "%s: %s\n", prefix, string); +} + +static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + nvme_print(ctrl, "serial", id->sn, sizeof(id->sn)); + nvme_print(ctrl, "model", id->mn, sizeof(id->mn)); + nvme_print(ctrl, "firmware", id->fr, sizeof(id->fr)); + + return 0; +} + +/* + * Initialize the cached copies of the Identify data and various controller + * register in our nvme_ctrl structure. This should be called as soon as + * the admin queue is fully up and running. + */ +int nvme_init_identify(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + u64 cap; + int ret, page_shift; + u32 max_hw_sectors; + + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); + if (ret) { + dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret); + return ret; + } + + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); + if (ret) { + dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret); + return ret; + } + page_shift = NVME_CAP_MPSMIN(cap) + 12; + + ret = nvme_identify_ctrl(ctrl, &id); + if (ret) { + dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret); + return -EIO; + } + + ret = nvme_init_subsystem(ctrl, id); + if (ret) + return ret; + + if (id->mdts) + max_hw_sectors = 1 << (id->mdts + page_shift - 9); + else + max_hw_sectors = UINT_MAX; + ctrl->max_hw_sectors = + min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); + + kfree(id); + return 0; +} +EXPORT_SYMBOL_GPL(nvme_init_identify); + + +/* + * Initialize a NVMe controller structures. This needs to be called during + * earliest initialization so that we have the initialized structured around + * during probing. + */ +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device_d *dev, + const struct nvme_ctrl_ops *ops) +{ + static int instance = 0; + + ctrl->dev = dev; + ctrl->ops = ops; + ctrl->instance = instance++; + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_init_ctrl); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h new file mode 100644 index 0000000..4ec4aef --- /dev/null +++ b/drivers/nvme/host/nvme.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _NVME_H +#define _NVME_H + +#include +#include +#include + +#define ADMIN_TIMEOUT (60 * HZ) +#define SHUTDOWN_TIMEOUT ( 5 * HZ) + +/* + * Common request structure for NVMe passthrough. All drivers must have + * this structure as the first member of their request-private data. + */ +struct nvme_request { + struct nvme_command *cmd; + union nvme_result result; + u16 status; + + void *buffer; + unsigned int buffer_len; + dma_addr_t buffer_dma_addr; + enum dma_data_direction dma_dir; +}; + +struct nvme_ctrl { + const struct nvme_ctrl_ops *ops; + struct device_d *dev; + int instance; + + u32 ctrl_config; + u32 queue_count; + u64 cap; + u32 page_size; + u32 max_hw_sectors; + u32 vs; +}; + +/* + * Anchor structure for namespaces. There is one for each namespace in a + * NVMe subsystem that any of our controllers can see, and the namespace + * structure for each controller is chained of it. For private namespaces + * there is a 1:1 relation to our namespace structures, that is ->list + * only ever has a single entry for private namespaces. + */ +struct nvme_ns_head { + unsigned ns_id; + int instance; +}; + +struct nvme_ns { + struct nvme_ctrl *ctrl; + struct nvme_ns_head *head; + struct block_device blk; + + int lba_shift; + bool readonly; +}; + +static inline struct nvme_ns *to_nvme_ns(struct block_device *blk) +{ + return container_of(blk, struct nvme_ns, blk); +} + +struct nvme_ctrl_ops { + int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); + int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); + int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); + + int (*submit_sync_cmd)(struct nvme_ctrl *ctrl, + struct nvme_command *cmd, + union nvme_result *result, + void *buffer, + unsigned bufflen, + unsigned timeout, int qid); +}; + +static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) +{ + u32 val = 0; + + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) + return false; + return val & NVME_CSTS_RDY; +} + +static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) +{ + return (sector >> (ns->lba_shift - 9)); +} + +static inline void nvme_end_request(struct nvme_request *rq, __le16 status, + union nvme_result result) +{ + rq->status = le16_to_cpu(status) >> 1; + rq->result = result; +} + +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device_d *dev, + const struct nvme_ctrl_ops *ops); +void nvme_start_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_identify(struct nvme_ctrl *ctrl); + +enum nvme_queue_id { + NVME_QID_ADMIN, + NVME_QID_IO, + NVME_QID_NUM, + NVME_QID_ANY = -1, +}; + +int __nvme_submit_sync_cmd(struct nvme_ctrl *ctrl, + struct nvme_command *cmd, + union nvme_result *result, + void *buffer, unsigned bufflen, + unsigned timeout, int qid); +int nvme_submit_sync_cmd(struct nvme_ctrl *ctrl, + struct nvme_command *cmd, + void *buffer, unsigned bufflen); + + +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); +/* + * Without the multipath code enabled, multiple controller per subsystems are + * visible as devices and thus we cannot use the subsystem instance. + */ +static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, + struct nvme_ctrl *ctrl, int *flags) +{ + sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); +} + +#endif /* _NVME_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c new file mode 100644 index 0000000..387bc45 --- /dev/null +++ b/drivers/nvme/host/pci.c @@ -0,0 +1,697 @@ + +#include +#include +#include +#include +#include + +#include + +#include "nvme.h" + +#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) +#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) + +#define NVME_MAX_KB_SZ 4096 + +static int io_queue_depth = 2; + +struct nvme_dev; + +/* + * An NVM Express queue. Each device has at least two (one for admin + * commands and one for I/O commands). + */ +struct nvme_queue { + struct nvme_dev *dev; + struct nvme_request *req; + struct nvme_command *sq_cmds; + volatile struct nvme_completion *cqes; + dma_addr_t sq_dma_addr; + dma_addr_t cq_dma_addr; + u32 __iomem *q_db; + u16 q_depth; + u16 sq_tail; + u16 cq_head; + u16 qid; + u8 cq_phase; + + u16 counter; +}; + +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. + */ +struct nvme_dev { + struct nvme_queue queues[NVME_QID_NUM]; + u32 __iomem *dbs; + struct device_d *dev; + unsigned online_queues; + unsigned max_qid; + int q_depth; + u32 db_stride; + void __iomem *bar; + bool subsystem; + struct nvme_ctrl ctrl; + __le64 *prp_pool; + unsigned int prp_pool_size; + dma_addr_t prp_dma; +}; + +static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_dev, ctrl); +} + +static int nvme_pci_setup_prps(struct nvme_dev *dev, + const struct nvme_request *req, + struct nvme_rw_command *cmnd) +{ + int length = req->buffer_len; + const int page_size = dev->ctrl.page_size; + dma_addr_t dma_addr = req->buffer_dma_addr; + u32 offset = dma_addr & (page_size - 1); + u64 prp1 = dma_addr; + __le64 *prp_list; + int i, nprps; + dma_addr_t prp_dma; + + + length -= (page_size - offset); + if (length <= 0) { + prp_dma = 0; + goto done; + } + + dma_addr += (page_size - offset); + + if (length <= page_size) { + prp_dma = dma_addr; + goto done; + } + + nprps = DIV_ROUND_UP(length, page_size); + if (nprps > dev->prp_pool_size) { + dma_free_coherent(dev->prp_pool, dev->prp_dma, + dev->prp_pool_size * sizeof(u64)); + dev->prp_pool_size = nprps; + dev->prp_pool = dma_alloc_coherent(nprps * sizeof(u64), + &dev->prp_dma); + } + + prp_list = dev->prp_pool; + prp_dma = dev->prp_dma; + + i = 0; + for (;;) { + if (i == page_size >> 3) { + __le64 *old_prp_list = prp_list; + prp_list = &prp_list[i]; + prp_dma += page_size; + prp_list[0] = old_prp_list[i - 1]; + old_prp_list[i - 1] = cpu_to_le64(prp_dma); + i = 1; + } + + prp_list[i++] = cpu_to_le64(dma_addr); + dma_addr += page_size; + length -= page_size; + if (length <= 0) + break; + } + +done: + cmnd->dptr.prp1 = cpu_to_le64(prp1); + cmnd->dptr.prp2 = cpu_to_le64(prp_dma); + + return 0; +} + +static int nvme_map_data(struct nvme_dev *dev, struct nvme_request *req) +{ + if (!req->buffer || !req->buffer_len) + return 0; + + req->buffer_dma_addr = dma_map_single(dev->dev, req->buffer, + req->buffer_len, req->dma_dir); + if (dma_mapping_error(dev->dev, req->buffer_dma_addr)) + return -EFAULT; + + return nvme_pci_setup_prps(dev, req, &req->cmd->rw); +} + +static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_request *req) +{ + if (!req->buffer || !req->buffer_len) + return; + + dma_unmap_single(dev->dev, req->buffer_dma_addr, req->buffer_len, + req->dma_dir); +} + +static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) +{ + struct nvme_queue *nvmeq = &dev->queues[qid]; + + if (dev->ctrl.queue_count > qid) + return 0; + + nvmeq->cqes = dma_alloc_coherent(CQ_SIZE(depth), + &nvmeq->cq_dma_addr); + if (!nvmeq->cqes) + goto free_nvmeq; + + nvmeq->sq_cmds = dma_alloc_coherent(SQ_SIZE(depth), + &nvmeq->sq_dma_addr); + if (!nvmeq->sq_cmds) + goto free_cqdma; + + nvmeq->dev = dev; + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; + nvmeq->q_depth = depth; + nvmeq->qid = qid; + dev->ctrl.queue_count++; + + return 0; + + free_cqdma: + dma_free_coherent((void *)nvmeq->cqes, nvmeq->cq_dma_addr, + CQ_SIZE(depth)); + free_nvmeq: + return -ENOMEM; +} + +static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.delete_queue.opcode = opcode; + c.delete_queue.qid = cpu_to_le16(id); + + return nvme_submit_sync_cmd(&dev->ctrl, &c, NULL, 0); +} + +static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq, s16 vector) +{ + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; + + /* + * Note: we (ab)use the fact that the prp fields survive if no data + * is attached to the request. + */ + memset(&c, 0, sizeof(c)); + c.create_cq.opcode = nvme_admin_create_cq; + c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); + c.create_cq.cqid = cpu_to_le16(qid); + c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_cq.cq_flags = cpu_to_le16(flags); + c.create_cq.irq_vector = cpu_to_le16(vector); + + return nvme_submit_sync_cmd(&dev->ctrl, &c, NULL, 0); +} + +static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG; + + /* + * Note: we (ab)use the fact that the prp fields survive if no data + * is attached to the request. + */ + memset(&c, 0, sizeof(c)); + c.create_sq.opcode = nvme_admin_create_sq; + c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); + c.create_sq.sqid = cpu_to_le16(qid); + c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_sq.sq_flags = cpu_to_le16(flags); + c.create_sq.cqid = cpu_to_le16(qid); + + return nvme_submit_sync_cmd(&dev->ctrl, &c, NULL, 0); +} + +static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); +} + +static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) +{ + struct nvme_dev *dev = nvmeq->dev; + + nvmeq->sq_tail = 0; + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; + dev->online_queues++; +} + +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) +{ + struct nvme_dev *dev = nvmeq->dev; + int result; + s16 vector; + + vector = 0; + result = adapter_alloc_cq(dev, qid, nvmeq, vector); + if (result) + return result; + + result = adapter_alloc_sq(dev, qid, nvmeq); + if (result < 0) + return result; + else if (result) + goto release_cq; + + nvme_init_queue(nvmeq, qid); + + return result; + +release_cq: + adapter_delete_cq(dev, qid); + return result; +} + +/** + * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell + * @nvmeq: The queue to use + * @cmd: The command to send + */ +static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +{ + memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); +} + +/* We read the CQE phase first to check if the rest of the entry is valid */ +static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq) +{ + return (le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) == + nvmeq->cq_phase; +} + +static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq) +{ + u16 head = nvmeq->cq_head; + + writel(head, nvmeq->q_db + nvmeq->dev->db_stride); +} + +static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) +{ + volatile struct nvme_completion *cqe = &nvmeq->cqes[idx]; + struct nvme_request *req = nvmeq->req; + + if (unlikely(cqe->command_id >= nvmeq->q_depth)) { + dev_warn(nvmeq->dev->ctrl.dev, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpu(cqe->sq_id)); + return; + } + + if (WARN_ON(cqe->command_id != req->cmd->common.command_id)) + return; + + nvme_end_request(req, cqe->status, cqe->result); +} + +static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end) +{ + while (start != end) { + nvme_handle_cqe(nvmeq, start); + if (++start == nvmeq->q_depth) + start = 0; + } +} + +static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) +{ + if (++nvmeq->cq_head == nvmeq->q_depth) { + nvmeq->cq_head = 0; + nvmeq->cq_phase = !nvmeq->cq_phase; + } +} + +static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, + u16 *end, int tag) +{ + bool found = false; + + *start = nvmeq->cq_head; + while (!found && nvme_cqe_pending(nvmeq)) { + if (nvmeq->cqes[nvmeq->cq_head].command_id == tag) + found = true; + nvme_update_cq_head(nvmeq); + } + *end = nvmeq->cq_head; + + if (*start != *end) + nvme_ring_cq_doorbell(nvmeq); + return found; +} + +static bool nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) +{ + u16 start, end; + bool found; + + if (!nvme_cqe_pending(nvmeq)) + return false; + + found = nvme_process_cq(nvmeq, &start, &end, tag); + + nvme_complete_cqes(nvmeq, start, end); + return found; +} + +static int nvme_pci_submit_sync_cmd(struct nvme_ctrl *ctrl, + struct nvme_command *cmd, + union nvme_result *result, + void *buffer, + unsigned int buffer_len, + unsigned timeout, int qid) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + struct nvme_queue *nvmeq = &dev->queues[qid]; + struct nvme_request req = { }; + const u16 tag = nvmeq->counter++ & (nvmeq->q_depth - 1); + enum dma_data_direction dma_dir; + int ret; + + switch (qid) { + case NVME_QID_ADMIN: + switch (cmd->common.opcode) { + case nvme_admin_create_sq: + case nvme_admin_create_cq: + case nvme_admin_delete_sq: + case nvme_admin_delete_cq: + case nvme_admin_set_features: + dma_dir = DMA_TO_DEVICE; + break; + case nvme_admin_identify: + dma_dir = DMA_FROM_DEVICE; + break; + default: + return -EINVAL; + } + break; + case NVME_QID_IO: + switch (cmd->rw.opcode) { + case nvme_cmd_write: + dma_dir = DMA_TO_DEVICE; + break; + case nvme_cmd_read: + dma_dir = DMA_FROM_DEVICE; + break; + default: + return -EINVAL; + } + break; + default: + return -EINVAL; + } + + cmd->common.command_id = tag; + + timeout = timeout ?: ADMIN_TIMEOUT; + + req.cmd = cmd; + req.buffer = buffer; + req.buffer_len = buffer_len; + req.dma_dir = dma_dir; + + ret = nvme_map_data(dev, &req); + if (ret) { + dev_err(dev->dev, "Failed to map request data\n"); + return ret; + } + + nvme_submit_cmd(nvmeq, cmd); + + nvmeq->req = &req; + ret = wait_on_timeout(timeout, nvme_poll(nvmeq, tag)); + nvmeq->req = NULL; + + nvme_unmap_data(dev, &req); + + if (result) + *result = req.result; + + return ret ?: req.status; +} + +static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) +{ + int result; + u32 aqa; + struct nvme_queue *nvmeq; + + dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ? + NVME_CAP_NSSRC(dev->ctrl.cap) : 0; + + if (dev->subsystem && + (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) + writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); + + result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); + if (result < 0) + return result; + + result = nvme_alloc_queue(dev, NVME_QID_ADMIN, NVME_AQ_DEPTH); + if (result) + return result; + + nvmeq = &dev->queues[NVME_QID_ADMIN]; + aqa = nvmeq->q_depth - 1; + aqa |= aqa << 16; + + writel(aqa, dev->bar + NVME_REG_AQA); + writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); + writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); + + result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap); + if (result) + return result; + + nvme_init_queue(nvmeq, NVME_QID_ADMIN); + + return result; +} + +static int nvme_create_io_queues(struct nvme_dev *dev) +{ + unsigned i, max; + int ret = 0; + + for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { + if (nvme_alloc_queue(dev, i, dev->q_depth)) { + ret = -ENOMEM; + break; + } + } + + max = min(dev->max_qid, dev->ctrl.queue_count - 1); + for (i = dev->online_queues; i <= max; i++) { + ret = nvme_create_queue(&dev->queues[i], i); + if (ret) + break; + } + + /* + * Ignore failing Create SQ/CQ commands, we can continue with less + * than the desired amount of queues, and even a controller without + * I/O queues can still be used to issue admin commands. This might + * be useful to upgrade a buggy firmware for example. + */ + return ret >= 0 ? 0 : ret; +} + +static int nvme_setup_io_queues(struct nvme_dev *dev) +{ + int result, nr_io_queues; + + nr_io_queues = NVME_QID_NUM - 1; + result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); + if (result < 0) + return result; + + dev->max_qid = nr_io_queues; + + return nvme_create_io_queues(dev); +} + +static int nvme_pci_enable(struct nvme_dev *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (pci_enable_device(pdev)) + return -ENOMEM; + + pci_set_master(pdev); + + if (readl(dev->bar + NVME_REG_CSTS) == -1) + return -ENODEV; + + dev->ctrl.cap = readq(dev->bar + NVME_REG_CAP); + + dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, + io_queue_depth); + dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); + dev->dbs = dev->bar + 4096; + + return 0; +} + +static void nvme_reset_work(struct nvme_dev *dev) +{ + int result = -ENODEV; + + result = nvme_pci_enable(dev); + if (result) + goto out; + + result = nvme_pci_configure_admin_queue(dev); + if (result) + goto out; + + /* + * Limit the max command size to prevent iod->sg allocations going + * over a single page. + */ + dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; + + result = nvme_init_identify(&dev->ctrl); + if (result) + goto out; + + result = nvme_setup_io_queues(dev); + if (result) { + dev_err(dev->ctrl.dev, "IO queues not created\n"); + goto out; + } + + nvme_start_ctrl(&dev->ctrl); +out: + return; +} + +static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) +{ + *val = readl(to_nvme_dev(ctrl)->bar + off); + return 0; +} + +static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) +{ + writel(val, to_nvme_dev(ctrl)->bar + off); + return 0; +} + +static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) +{ + *val = readq(to_nvme_dev(ctrl)->bar + off); + return 0; +} + +static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { + .reg_read32 = nvme_pci_reg_read32, + .reg_write32 = nvme_pci_reg_write32, + .reg_read64 = nvme_pci_reg_read64, + .submit_sync_cmd = nvme_pci_submit_sync_cmd, +}; + +static void nvme_dev_map(struct nvme_dev *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + + dev->bar = pci_iomap(pdev, 0); +} + +static void nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) +{ + int ret; + ret = adapter_delete_queue(nvmeq->dev, opcode, nvmeq->qid); + if (ret < 0) + dev_err(nvmeq->dev->dev, "%s: %s\n", __func__, + strerror(-ret)); + else if (ret) + dev_err(nvmeq->dev->dev, + "%s: status code type: %xh, status code %02xh\n", + __func__, (ret >> 8) & 0xf, ret & 0xff); +} + +static void nvme_disable_io_queues(struct nvme_dev *dev) +{ + int i, queues = dev->online_queues - 1; + + for (i = queues; i > 0; i--) { + nvme_delete_queue(&dev->queues[i], nvme_admin_delete_sq); + nvme_delete_queue(&dev->queues[i], nvme_admin_delete_cq); + } +} + +static void nvme_disable_admin_queue(struct nvme_dev *dev) +{ + struct nvme_queue *nvmeq = &dev->queues[0]; + u16 start, end; + + nvme_shutdown_ctrl(&dev->ctrl); + nvme_process_cq(nvmeq, &start, &end, -1); + nvme_complete_cqes(nvmeq, start, end); +} + +static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct nvme_dev *dev; + int result; + + dev = xzalloc(sizeof(*dev)); + dev->dev = &pdev->dev; + pdev->dev.priv = dev; + + nvme_dev_map(dev); + result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops); + if (result) + return result; + + nvme_reset_work(dev); + + return 0; +} + +static void nvme_remove(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pdev->dev.priv; + bool dead = true; + + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY)); + + if (!dead && dev->ctrl.queue_count > 0) { + nvme_disable_io_queues(dev); + nvme_disable_admin_queue(dev); + } +} + +static const struct pci_device_id nvme_id_table[] = { + { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, PCI_ANY_ID) }, + { 0, }, +}; + +static struct pci_driver nvme_driver = { + .name = "nvme", + .id_table = nvme_id_table, + .probe = nvme_probe, + .remove = nvme_remove, +}; +device_pci_driver(nvme_driver); diff --git a/include/linux/nvme.h b/include/linux/nvme.h new file mode 100644 index 0000000..818dbe9 --- /dev/null +++ b/include/linux/nvme.h @@ -0,0 +1,1271 @@ +/* + * Definitions for the NVM Express interface + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_NVME_H +#define _LINUX_NVME_H + +#include +#include + +/* NQN names in commands fields specified one size */ +#define NVMF_NQN_FIELD_LEN 256 + +/* However the max length of a qualified name is another size */ +#define NVMF_NQN_SIZE 223 + +#define NVMF_TRSVCID_SIZE 32 +#define NVMF_TRADDR_SIZE 256 +#define NVMF_TSAS_SIZE 256 + +#define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" + +#define NVME_RDMA_IP_PORT 4420 + +#define NVME_NSID_ALL 0xffffffff + +enum nvme_subsys_type { + NVME_NQN_DISC = 1, /* Discovery type target subsystem */ + NVME_NQN_NVME = 2, /* NVME type target subsystem */ +}; + +/* Address Family codes for Discovery Log Page entry ADRFAM field */ +enum { + NVMF_ADDR_FAMILY_PCI = 0, /* PCIe */ + NVMF_ADDR_FAMILY_IP4 = 1, /* IP4 */ + NVMF_ADDR_FAMILY_IP6 = 2, /* IP6 */ + NVMF_ADDR_FAMILY_IB = 3, /* InfiniBand */ + NVMF_ADDR_FAMILY_FC = 4, /* Fibre Channel */ +}; + +/* Transport Type codes for Discovery Log Page entry TRTYPE field */ +enum { + NVMF_TRTYPE_RDMA = 1, /* RDMA */ + NVMF_TRTYPE_FC = 2, /* Fibre Channel */ + NVMF_TRTYPE_LOOP = 254, /* Reserved for host usage */ + NVMF_TRTYPE_MAX, +}; + +/* Transport Requirements codes for Discovery Log Page entry TREQ field */ +enum { + NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */ + NVMF_TREQ_REQUIRED = 1, /* Required */ + NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_QPTYPE_CONNECTED = 1, /* Reliable Connected */ + NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 1, /* No Provider Specified */ + NVMF_RDMA_PRTYPE_IB = 2, /* InfiniBand */ + NVMF_RDMA_PRTYPE_ROCE = 3, /* InfiniBand RoCE */ + NVMF_RDMA_PRTYPE_ROCEV2 = 4, /* InfiniBand RoCEV2 */ + NVMF_RDMA_PRTYPE_IWARP = 5, /* IWARP */ +}; + +/* RDMA Connection Management Service Type codes for Discovery Log Page + * entry TSAS RDMA_CMS field + */ +enum { + NVMF_RDMA_CMS_RDMA_CM = 1, /* Sockets based endpoint addressing */ +}; + +#define NVME_AQ_DEPTH 32 +#define NVME_NR_AEN_COMMANDS 1 +#define NVME_AQ_BLK_MQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) + +/* + * Subtract one to leave an empty queue entry for 'Full Queue' condition. See + * NVM-Express 1.2 specification, section 4.1.2. + */ +#define NVME_AQ_MQ_TAG_DEPTH (NVME_AQ_BLK_MQ_DEPTH - 1) + +enum { + NVME_REG_CAP = 0x0000, /* Controller Capabilities */ + NVME_REG_VS = 0x0008, /* Version */ + NVME_REG_INTMS = 0x000c, /* Interrupt Mask Set */ + NVME_REG_INTMC = 0x0010, /* Interrupt Mask Clear */ + NVME_REG_CC = 0x0014, /* Controller Configuration */ + NVME_REG_CSTS = 0x001c, /* Controller Status */ + NVME_REG_NSSR = 0x0020, /* NVM Subsystem Reset */ + NVME_REG_AQA = 0x0024, /* Admin Queue Attributes */ + NVME_REG_ASQ = 0x0028, /* Admin SQ Base Address */ + NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */ + NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ + NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ + NVME_REG_DBS = 0x1000, /* SQ 0 Tail Doorbell */ +}; + +#define NVME_CAP_MQES(cap) ((cap) & 0xffff) +#define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) +#define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) +#define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1) +#define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) +#define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) + +#define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) +#define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) + +enum { + NVME_CMBSZ_SQS = 1 << 0, + NVME_CMBSZ_CQS = 1 << 1, + NVME_CMBSZ_LISTS = 1 << 2, + NVME_CMBSZ_RDS = 1 << 3, + NVME_CMBSZ_WDS = 1 << 4, + + NVME_CMBSZ_SZ_SHIFT = 12, + NVME_CMBSZ_SZ_MASK = 0xfffff, + + NVME_CMBSZ_SZU_SHIFT = 8, + NVME_CMBSZ_SZU_MASK = 0xf, +}; + +/* + * Submission and Completion Queue Entry Sizes for the NVM command set. + * (In bytes and specified as a power of two (2^n)). + */ +#define NVME_NVM_IOSQES 6 +#define NVME_NVM_IOCQES 4 + +enum { + NVME_CC_ENABLE = 1 << 0, + NVME_CC_CSS_NVM = 0 << 4, + NVME_CC_EN_SHIFT = 0, + NVME_CC_CSS_SHIFT = 4, + NVME_CC_MPS_SHIFT = 7, + NVME_CC_AMS_SHIFT = 11, + NVME_CC_SHN_SHIFT = 14, + NVME_CC_IOSQES_SHIFT = 16, + NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, + NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, + NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, + NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + NVME_CSTS_RDY = 1 << 0, + NVME_CSTS_CFS = 1 << 1, + NVME_CSTS_NSSRO = 1 << 4, + NVME_CSTS_PP = 1 << 5, + NVME_CSTS_SHST_NORMAL = 0 << 2, + NVME_CSTS_SHST_OCCUR = 1 << 2, + NVME_CSTS_SHST_CMPLT = 2 << 2, + NVME_CSTS_SHST_MASK = 3 << 2, +}; + +struct nvme_id_power_state { + __le16 max_power; /* centiwatts */ + __u8 rsvd2; + __u8 flags; + __le32 entry_lat; /* microseconds */ + __le32 exit_lat; /* microseconds */ + __u8 read_tput; + __u8 read_lat; + __u8 write_tput; + __u8 write_lat; + __le16 idle_power; + __u8 idle_scale; + __u8 rsvd19; + __le16 active_power; + __u8 active_work_scale; + __u8 rsvd23[9]; +}; + +enum { + NVME_PS_FLAGS_MAX_POWER_SCALE = 1 << 0, + NVME_PS_FLAGS_NON_OP_STATE = 1 << 1, +}; + +struct nvme_id_ctrl { + __le16 vid; + __le16 ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + __u8 rab; + __u8 ieee[3]; + __u8 cmic; + __u8 mdts; + __le16 cntlid; + __le32 ver; + __le32 rtd3r; + __le32 rtd3e; + __le32 oaes; + __le32 ctratt; + __u8 rsvd100[156]; + __le16 oacs; + __u8 acl; + __u8 aerl; + __u8 frmw; + __u8 lpa; + __u8 elpe; + __u8 npss; + __u8 avscc; + __u8 apsta; + __le16 wctemp; + __le16 cctemp; + __le16 mtfa; + __le32 hmpre; + __le32 hmmin; + __u8 tnvmcap[16]; + __u8 unvmcap[16]; + __le32 rpmbs; + __le16 edstt; + __u8 dsto; + __u8 fwug; + __le16 kas; + __le16 hctma; + __le16 mntmt; + __le16 mxtmt; + __le32 sanicap; + __le32 hmminds; + __le16 hmmaxd; + __u8 rsvd338[4]; + __u8 anatt; + __u8 anacap; + __le32 anagrpmax; + __le32 nanagrpid; + __u8 rsvd352[160]; + __u8 sqes; + __u8 cqes; + __le16 maxcmd; + __le32 nn; + __le16 oncs; + __le16 fuses; + __u8 fna; + __u8 vwc; + __le16 awun; + __le16 awupf; + __u8 nvscc; + __u8 nwpc; + __le16 acwu; + __u8 rsvd534[2]; + __le32 sgls; + __le32 mnan; + __u8 rsvd544[224]; + char subnqn[256]; + __u8 rsvd1024[768]; + __le32 ioccsz; + __le32 iorcsz; + __le16 icdoff; + __u8 ctrattr; + __u8 msdbd; + __u8 rsvd1804[244]; + struct nvme_id_power_state psd[32]; + __u8 vs[1024]; +}; + +enum { + NVME_CTRL_ONCS_COMPARE = 1 << 0, + NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, + NVME_CTRL_ONCS_DSM = 1 << 2, + NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, + NVME_CTRL_ONCS_TIMESTAMP = 1 << 6, + NVME_CTRL_VWC_PRESENT = 1 << 0, + NVME_CTRL_OACS_SEC_SUPP = 1 << 0, + NVME_CTRL_OACS_DIRECTIVES = 1 << 5, + NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, + NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1, +}; + +struct nvme_lbaf { + __le16 ms; + __u8 ds; + __u8 rp; +}; + +struct nvme_id_ns { + __le64 nsze; + __le64 ncap; + __le64 nuse; + __u8 nsfeat; + __u8 nlbaf; + __u8 flbas; + __u8 mc; + __u8 dpc; + __u8 dps; + __u8 nmic; + __u8 rescap; + __u8 fpi; + __u8 rsvd33; + __le16 nawun; + __le16 nawupf; + __le16 nacwu; + __le16 nabsn; + __le16 nabo; + __le16 nabspf; + __le16 noiob; + __u8 nvmcap[16]; + __u8 rsvd64[28]; + __le32 anagrpid; + __u8 rsvd96[3]; + __u8 nsattr; + __u8 rsvd100[4]; + __u8 nguid[16]; + __u8 eui64[8]; + struct nvme_lbaf lbaf[16]; + __u8 rsvd192[192]; + __u8 vs[3712]; +}; + +enum { + NVME_ID_CNS_NS = 0x00, + NVME_ID_CNS_CTRL = 0x01, + NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, + NVME_ID_CNS_NS_DESC_LIST = 0x03, + NVME_ID_CNS_NS_PRESENT_LIST = 0x10, + NVME_ID_CNS_NS_PRESENT = 0x11, + NVME_ID_CNS_CTRL_NS_LIST = 0x12, + NVME_ID_CNS_CTRL_LIST = 0x13, +}; + +enum { + NVME_DIR_IDENTIFY = 0x00, + NVME_DIR_STREAMS = 0x01, + NVME_DIR_SND_ID_OP_ENABLE = 0x01, + NVME_DIR_SND_ST_OP_REL_ID = 0x01, + NVME_DIR_SND_ST_OP_REL_RSC = 0x02, + NVME_DIR_RCV_ID_OP_PARAM = 0x01, + NVME_DIR_RCV_ST_OP_PARAM = 0x01, + NVME_DIR_RCV_ST_OP_STATUS = 0x02, + NVME_DIR_RCV_ST_OP_RESOURCE = 0x03, + NVME_DIR_ENDIR = 0x01, +}; + +enum { + NVME_NS_FEAT_THIN = 1 << 0, + NVME_NS_FLBAS_LBA_MASK = 0xf, + NVME_NS_FLBAS_META_EXT = 0x10, + NVME_LBAF_RP_BEST = 0, + NVME_LBAF_RP_BETTER = 1, + NVME_LBAF_RP_GOOD = 2, + NVME_LBAF_RP_DEGRADED = 3, + NVME_NS_DPC_PI_LAST = 1 << 4, + NVME_NS_DPC_PI_FIRST = 1 << 3, + NVME_NS_DPC_PI_TYPE3 = 1 << 2, + NVME_NS_DPC_PI_TYPE2 = 1 << 1, + NVME_NS_DPC_PI_TYPE1 = 1 << 0, + NVME_NS_DPS_PI_FIRST = 1 << 3, + NVME_NS_DPS_PI_MASK = 0x7, + NVME_NS_DPS_PI_TYPE1 = 1, + NVME_NS_DPS_PI_TYPE2 = 2, + NVME_NS_DPS_PI_TYPE3 = 3, +}; + +struct nvme_ns_id_desc { + __u8 nidt; + __u8 nidl; + __le16 reserved; +}; + +#define NVME_NIDT_EUI64_LEN 8 +#define NVME_NIDT_NGUID_LEN 16 +#define NVME_NIDT_UUID_LEN 16 + +enum { + NVME_NIDT_EUI64 = 0x01, + NVME_NIDT_NGUID = 0x02, + NVME_NIDT_UUID = 0x03, +}; + +struct nvme_smart_log { + __u8 critical_warning; + __u8 temperature[2]; + __u8 avail_spare; + __u8 spare_thresh; + __u8 percent_used; + __u8 rsvd6[26]; + __u8 data_units_read[16]; + __u8 data_units_written[16]; + __u8 host_reads[16]; + __u8 host_writes[16]; + __u8 ctrl_busy_time[16]; + __u8 power_cycles[16]; + __u8 power_on_hours[16]; + __u8 unsafe_shutdowns[16]; + __u8 media_errors[16]; + __u8 num_err_log_entries[16]; + __le32 warning_temp_time; + __le32 critical_comp_time; + __le16 temp_sensor[8]; + __u8 rsvd216[296]; +}; + +struct nvme_fw_slot_info_log { + __u8 afi; + __u8 rsvd1[7]; + __le64 frs[7]; + __u8 rsvd64[448]; +}; + +enum { + NVME_CMD_EFFECTS_CSUPP = 1 << 0, + NVME_CMD_EFFECTS_LBCC = 1 << 1, + NVME_CMD_EFFECTS_NCC = 1 << 2, + NVME_CMD_EFFECTS_NIC = 1 << 3, + NVME_CMD_EFFECTS_CCC = 1 << 4, + NVME_CMD_EFFECTS_CSE_MASK = 3 << 16, +}; + +struct nvme_effects_log { + __le32 acs[256]; + __le32 iocs[256]; + __u8 resv[2048]; +}; + +enum nvme_ana_state { + NVME_ANA_OPTIMIZED = 0x01, + NVME_ANA_NONOPTIMIZED = 0x02, + NVME_ANA_INACCESSIBLE = 0x03, + NVME_ANA_PERSISTENT_LOSS = 0x04, + NVME_ANA_CHANGE = 0x0f, +}; + +struct nvme_ana_group_desc { + __le32 grpid; + __le32 nnsids; + __le64 chgcnt; + __u8 state; + __u8 rsvd17[15]; + __le32 nsids[]; +}; + +/* flag for the log specific field of the ANA log */ +#define NVME_ANA_LOG_RGO (1 << 0) + +struct nvme_ana_rsp_hdr { + __le64 chgcnt; + __le16 ngrps; + __le16 rsvd10[3]; +}; + +enum { + NVME_SMART_CRIT_SPARE = 1 << 0, + NVME_SMART_CRIT_TEMPERATURE = 1 << 1, + NVME_SMART_CRIT_RELIABILITY = 1 << 2, + NVME_SMART_CRIT_MEDIA = 1 << 3, + NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4, +}; + +enum { + NVME_AER_ERROR = 0, + NVME_AER_SMART = 1, + NVME_AER_NOTICE = 2, + NVME_AER_CSS = 6, + NVME_AER_VS = 7, +}; + +enum { + NVME_AER_NOTICE_NS_CHANGED = 0x00, + NVME_AER_NOTICE_FW_ACT_STARTING = 0x01, + NVME_AER_NOTICE_ANA = 0x03, +}; + +enum { + NVME_AEN_CFG_NS_ATTR = 1 << 8, + NVME_AEN_CFG_FW_ACT = 1 << 9, + NVME_AEN_CFG_ANA_CHANGE = 1 << 11, +}; + +struct nvme_lba_range_type { + __u8 type; + __u8 attributes; + __u8 rsvd2[14]; + __u64 slba; + __u64 nlb; + __u8 guid[16]; + __u8 rsvd48[16]; +}; + +enum { + NVME_LBART_TYPE_FS = 0x01, + NVME_LBART_TYPE_RAID = 0x02, + NVME_LBART_TYPE_CACHE = 0x03, + NVME_LBART_TYPE_SWAP = 0x04, + + NVME_LBART_ATTRIB_TEMP = 1 << 0, + NVME_LBART_ATTRIB_HIDE = 1 << 1, +}; + +struct nvme_reservation_status { + __le32 gen; + __u8 rtype; + __u8 regctl[2]; + __u8 resv5[2]; + __u8 ptpls; + __u8 resv10[13]; + struct { + __le16 cntlid; + __u8 rcsts; + __u8 resv3[5]; + __le64 hostid; + __le64 rkey; + } regctl_ds[]; +}; + +enum nvme_async_event_type { + NVME_AER_TYPE_ERROR = 0, + NVME_AER_TYPE_SMART = 1, + NVME_AER_TYPE_NOTICE = 2, +}; + +/* I/O commands */ + +enum nvme_opcode { + nvme_cmd_flush = 0x00, + nvme_cmd_write = 0x01, + nvme_cmd_read = 0x02, + nvme_cmd_write_uncor = 0x04, + nvme_cmd_compare = 0x05, + nvme_cmd_write_zeroes = 0x08, + nvme_cmd_dsm = 0x09, + nvme_cmd_resv_register = 0x0d, + nvme_cmd_resv_report = 0x0e, + nvme_cmd_resv_acquire = 0x11, + nvme_cmd_resv_release = 0x15, +}; + +/* + * Descriptor subtype - lower 4 bits of nvme_(keyed_)sgl_desc identifier + * + * @NVME_SGL_FMT_ADDRESS: absolute address of the data block + * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block + * @NVME_SGL_FMT_TRANSPORT_A: transport defined format, value 0xA + * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation + * request subtype + */ +enum { + NVME_SGL_FMT_ADDRESS = 0x00, + NVME_SGL_FMT_OFFSET = 0x01, + NVME_SGL_FMT_TRANSPORT_A = 0x0A, + NVME_SGL_FMT_INVALIDATE = 0x0f, +}; + +/* + * Descriptor type - upper 4 bits of nvme_(keyed_)sgl_desc identifier + * + * For struct nvme_sgl_desc: + * @NVME_SGL_FMT_DATA_DESC: data block descriptor + * @NVME_SGL_FMT_SEG_DESC: sgl segment descriptor + * @NVME_SGL_FMT_LAST_SEG_DESC: last sgl segment descriptor + * + * For struct nvme_keyed_sgl_desc: + * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor + * + * Transport-specific SGL types: + * @NVME_TRANSPORT_SGL_DATA_DESC: Transport SGL data dlock descriptor + */ +enum { + NVME_SGL_FMT_DATA_DESC = 0x00, + NVME_SGL_FMT_SEG_DESC = 0x02, + NVME_SGL_FMT_LAST_SEG_DESC = 0x03, + NVME_KEY_SGL_FMT_DATA_DESC = 0x04, + NVME_TRANSPORT_SGL_DATA_DESC = 0x05, +}; + +struct nvme_sgl_desc { + __le64 addr; + __le32 length; + __u8 rsvd[3]; + __u8 type; +}; + +struct nvme_keyed_sgl_desc { + __le64 addr; + __u8 length[3]; + __u8 key[4]; + __u8 type; +}; + +union nvme_data_ptr { + struct { + __le64 prp1; + __le64 prp2; + }; + struct nvme_sgl_desc sgl; + struct nvme_keyed_sgl_desc ksgl; +}; + +/* + * Lowest two bits of our flags field (FUSE field in the spec): + * + * @NVME_CMD_FUSE_FIRST: Fused Operation, first command + * @NVME_CMD_FUSE_SECOND: Fused Operation, second command + * + * Highest two bits in our flags field (PSDT field in the spec): + * + * @NVME_CMD_PSDT_SGL_METABUF: Use SGLS for this transfer, + * If used, MPTR contains addr of single physical buffer (byte aligned). + * @NVME_CMD_PSDT_SGL_METASEG: Use SGLS for this transfer, + * If used, MPTR contains an address of an SGL segment containing + * exactly 1 SGL descriptor (qword aligned). + */ +enum { + NVME_CMD_FUSE_FIRST = (1 << 0), + NVME_CMD_FUSE_SECOND = (1 << 1), + + NVME_CMD_SGL_METABUF = (1 << 6), + NVME_CMD_SGL_METASEG = (1 << 7), + NVME_CMD_SGL_ALL = NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG, +}; + +struct nvme_common_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw2[2]; + __le64 metadata; + union nvme_data_ptr dptr; + __le32 cdw10[6]; +}; + +struct nvme_rw_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + +enum { + NVME_RW_LR = 1 << 15, + NVME_RW_FUA = 1 << 14, + NVME_RW_DSM_FREQ_UNSPEC = 0, + NVME_RW_DSM_FREQ_TYPICAL = 1, + NVME_RW_DSM_FREQ_RARE = 2, + NVME_RW_DSM_FREQ_READS = 3, + NVME_RW_DSM_FREQ_WRITES = 4, + NVME_RW_DSM_FREQ_RW = 5, + NVME_RW_DSM_FREQ_ONCE = 6, + NVME_RW_DSM_FREQ_PREFETCH = 7, + NVME_RW_DSM_FREQ_TEMP = 8, + NVME_RW_DSM_LATENCY_NONE = 0 << 4, + NVME_RW_DSM_LATENCY_IDLE = 1 << 4, + NVME_RW_DSM_LATENCY_NORM = 2 << 4, + NVME_RW_DSM_LATENCY_LOW = 3 << 4, + NVME_RW_DSM_SEQ_REQ = 1 << 6, + NVME_RW_DSM_COMPRESSED = 1 << 7, + NVME_RW_PRINFO_PRCHK_REF = 1 << 10, + NVME_RW_PRINFO_PRCHK_APP = 1 << 11, + NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, + NVME_RW_PRINFO_PRACT = 1 << 13, + NVME_RW_DTYPE_STREAMS = 1 << 4, +}; + +struct nvme_dsm_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 nr; + __le32 attributes; + __u32 rsvd12[4]; +}; + +enum { + NVME_DSMGMT_IDR = 1 << 0, + NVME_DSMGMT_IDW = 1 << 1, + NVME_DSMGMT_AD = 1 << 2, +}; + +#define NVME_DSM_MAX_RANGES 256 + +struct nvme_dsm_range { + __le32 cattr; + __le32 nlb; + __le64 slba; +}; + +struct nvme_write_zeroes_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + +/* Features */ + +struct nvme_feat_auto_pst { + __le64 entries[32]; +}; + +enum { + NVME_HOST_MEM_ENABLE = (1 << 0), + NVME_HOST_MEM_RETURN = (1 << 1), +}; + +/* Admin commands */ + +enum nvme_admin_opcode { + nvme_admin_delete_sq = 0x00, + nvme_admin_create_sq = 0x01, + nvme_admin_get_log_page = 0x02, + nvme_admin_delete_cq = 0x04, + nvme_admin_create_cq = 0x05, + nvme_admin_identify = 0x06, + nvme_admin_abort_cmd = 0x08, + nvme_admin_set_features = 0x09, + nvme_admin_get_features = 0x0a, + nvme_admin_async_event = 0x0c, + nvme_admin_ns_mgmt = 0x0d, + nvme_admin_activate_fw = 0x10, + nvme_admin_download_fw = 0x11, + nvme_admin_ns_attach = 0x15, + nvme_admin_keep_alive = 0x18, + nvme_admin_directive_send = 0x19, + nvme_admin_directive_recv = 0x1a, + nvme_admin_dbbuf = 0x7C, + nvme_admin_format_nvm = 0x80, + nvme_admin_security_send = 0x81, + nvme_admin_security_recv = 0x82, + nvme_admin_sanitize_nvm = 0x84, +}; + +enum { + NVME_QUEUE_PHYS_CONTIG = (1 << 0), + NVME_CQ_IRQ_ENABLED = (1 << 1), + NVME_SQ_PRIO_URGENT = (0 << 1), + NVME_SQ_PRIO_HIGH = (1 << 1), + NVME_SQ_PRIO_MEDIUM = (2 << 1), + NVME_SQ_PRIO_LOW = (3 << 1), + NVME_FEAT_ARBITRATION = 0x01, + NVME_FEAT_POWER_MGMT = 0x02, + NVME_FEAT_LBA_RANGE = 0x03, + NVME_FEAT_TEMP_THRESH = 0x04, + NVME_FEAT_ERR_RECOVERY = 0x05, + NVME_FEAT_VOLATILE_WC = 0x06, + NVME_FEAT_NUM_QUEUES = 0x07, + NVME_FEAT_IRQ_COALESCE = 0x08, + NVME_FEAT_IRQ_CONFIG = 0x09, + NVME_FEAT_WRITE_ATOMIC = 0x0a, + NVME_FEAT_ASYNC_EVENT = 0x0b, + NVME_FEAT_AUTO_PST = 0x0c, + NVME_FEAT_HOST_MEM_BUF = 0x0d, + NVME_FEAT_TIMESTAMP = 0x0e, + NVME_FEAT_KATO = 0x0f, + NVME_FEAT_HCTM = 0x10, + NVME_FEAT_NOPSC = 0x11, + NVME_FEAT_RRL = 0x12, + NVME_FEAT_PLM_CONFIG = 0x13, + NVME_FEAT_PLM_WINDOW = 0x14, + NVME_FEAT_SW_PROGRESS = 0x80, + NVME_FEAT_HOST_ID = 0x81, + NVME_FEAT_RESV_MASK = 0x82, + NVME_FEAT_RESV_PERSIST = 0x83, + NVME_FEAT_WRITE_PROTECT = 0x84, + NVME_LOG_ERROR = 0x01, + NVME_LOG_SMART = 0x02, + NVME_LOG_FW_SLOT = 0x03, + NVME_LOG_CHANGED_NS = 0x04, + NVME_LOG_CMD_EFFECTS = 0x05, + NVME_LOG_ANA = 0x0c, + NVME_LOG_DISC = 0x70, + NVME_LOG_RESERVATION = 0x80, + NVME_FWACT_REPL = (0 << 3), + NVME_FWACT_REPL_ACTV = (1 << 3), + NVME_FWACT_ACTV = (2 << 3), +}; + +/* NVMe Namespace Write Protect State */ +enum { + NVME_NS_NO_WRITE_PROTECT = 0, + NVME_NS_WRITE_PROTECT, + NVME_NS_WRITE_PROTECT_POWER_CYCLE, + NVME_NS_WRITE_PROTECT_PERMANENT, +}; + +#define NVME_MAX_CHANGED_NAMESPACES 1024 + +struct nvme_identify { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __u8 cns; + __u8 rsvd3; + __le16 ctrlid; + __u32 rsvd11[5]; +}; + +#define NVME_IDENTIFY_DATA_SIZE 4096 + +struct nvme_features { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 fid; + __le32 dword11; + __le32 dword12; + __le32 dword13; + __le32 dword14; + __le32 dword15; +}; + +struct nvme_host_mem_buf_desc { + __le64 addr; + __le32 size; + __u32 rsvd; +}; + +struct nvme_create_cq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 cqid; + __le16 qsize; + __le16 cq_flags; + __le16 irq_vector; + __u32 rsvd12[4]; +}; + +struct nvme_create_sq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 sqid; + __le16 qsize; + __le16 sq_flags; + __le16 cqid; + __u32 rsvd12[4]; +}; + +struct nvme_delete_queue { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 qid; + __u16 rsvd10; + __u32 rsvd11[5]; +}; + +struct nvme_abort_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 sqid; + __u16 cid; + __u32 rsvd11[5]; +}; + +struct nvme_download_firmware { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + union nvme_data_ptr dptr; + __le32 numd; + __le32 offset; + __u32 rsvd12[4]; +}; + +struct nvme_format_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[4]; + __le32 cdw10; + __u32 rsvd11[5]; +}; + +struct nvme_get_log_page_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __u8 lid; + __u8 lsp; /* upper 4 bits reserved */ + __le16 numdl; + __le16 numdu; + __u16 rsvd11; + __le32 lpol; + __le32 lpou; + __u32 rsvd14[2]; +}; + +struct nvme_directive_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 numd; + __u8 doper; + __u8 dtype; + __le16 dspec; + __u8 endir; + __u8 tdtype; + __u16 rsvd15; + + __u32 rsvd16[3]; +}; + +/* + * Fabrics subcommands. + */ +enum nvmf_fabrics_opcode { + nvme_fabrics_command = 0x7f, +}; + +enum nvmf_capsule_command { + nvme_fabrics_type_property_set = 0x00, + nvme_fabrics_type_connect = 0x01, + nvme_fabrics_type_property_get = 0x04, +}; + +struct nvmf_common_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 ts[24]; +}; + +/* + * The legal cntlid range a NVMe Target will provide. + * Note that cntlid of value 0 is considered illegal in the fabrics world. + * Devices based on earlier specs did not have the subsystem concept; + * therefore, those devices had their cntlid value set to 0 as a result. + */ +#define NVME_CNTLID_MIN 1 +#define NVME_CNTLID_MAX 0xffef +#define NVME_CNTLID_DYNAMIC 0xffff + +#define MAX_DISC_LOGS 255 + +/* Discovery log page entry */ +struct nvmf_disc_rsp_page_entry { + __u8 trtype; + __u8 adrfam; + __u8 subtype; + __u8 treq; + __le16 portid; + __le16 cntlid; + __le16 asqsz; + __u8 resv8[22]; + char trsvcid[NVMF_TRSVCID_SIZE]; + __u8 resv64[192]; + char subnqn[NVMF_NQN_FIELD_LEN]; + char traddr[NVMF_TRADDR_SIZE]; + union tsas { + char common[NVMF_TSAS_SIZE]; + struct rdma { + __u8 qptype; + __u8 prtype; + __u8 cms; + __u8 resv3[5]; + __u16 pkey; + __u8 resv10[246]; + } rdma; + } tsas; +}; + +/* Discovery log page header */ +struct nvmf_disc_rsp_page_hdr { + __le64 genctr; + __le64 numrec; + __le16 recfmt; + __u8 resv14[1006]; + struct nvmf_disc_rsp_page_entry entries[0]; +}; + +struct nvmf_connect_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[19]; + union nvme_data_ptr dptr; + __le16 recfmt; + __le16 qid; + __le16 sqsize; + __u8 cattr; + __u8 resv3; + __le32 kato; + __u8 resv4[12]; +}; + +struct nvmf_connect_data { + uuid_t hostid; + __le16 cntlid; + char resv4[238]; + char subsysnqn[NVMF_NQN_FIELD_LEN]; + char hostnqn[NVMF_NQN_FIELD_LEN]; + char resv5[256]; +}; + +struct nvmf_property_set_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __le64 value; + __u8 resv4[8]; +}; + +struct nvmf_property_get_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __u8 resv4[16]; +}; + +struct nvme_dbbuf { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __le64 prp2; + __u32 rsvd12[6]; +}; + +struct streams_directive_params { + __le16 msl; + __le16 nssa; + __le16 nsso; + __u8 rsvd[10]; + __le32 sws; + __le16 sgs; + __le16 nsa; + __le16 nso; + __u8 rsvd2[6]; +}; + +struct nvme_command { + union { + struct nvme_common_command common; + struct nvme_rw_command rw; + struct nvme_identify identify; + struct nvme_features features; + struct nvme_create_cq create_cq; + struct nvme_create_sq create_sq; + struct nvme_delete_queue delete_queue; + struct nvme_download_firmware dlfw; + struct nvme_format_cmd format; + struct nvme_dsm_cmd dsm; + struct nvme_write_zeroes_cmd write_zeroes; + struct nvme_abort_cmd abort; + struct nvme_get_log_page_command get_log_page; + struct nvmf_common_command fabrics; + struct nvmf_connect_command connect; + struct nvmf_property_set_command prop_set; + struct nvmf_property_get_command prop_get; + struct nvme_dbbuf dbbuf; + struct nvme_directive_cmd directive; + }; +}; + +static inline bool nvme_is_write(struct nvme_command *cmd) +{ + /* + * What a mess... + * + * Why can't we simply have a Fabrics In and Fabrics out command? + */ + if (unlikely(cmd->common.opcode == nvme_fabrics_command)) + return cmd->fabrics.fctype & 1; + return cmd->common.opcode & 1; +} + +enum { + /* + * Generic Command Status: + */ + NVME_SC_SUCCESS = 0x0, + NVME_SC_INVALID_OPCODE = 0x1, + NVME_SC_INVALID_FIELD = 0x2, + NVME_SC_CMDID_CONFLICT = 0x3, + NVME_SC_DATA_XFER_ERROR = 0x4, + NVME_SC_POWER_LOSS = 0x5, + NVME_SC_INTERNAL = 0x6, + NVME_SC_ABORT_REQ = 0x7, + NVME_SC_ABORT_QUEUE = 0x8, + NVME_SC_FUSED_FAIL = 0x9, + NVME_SC_FUSED_MISSING = 0xa, + NVME_SC_INVALID_NS = 0xb, + NVME_SC_CMD_SEQ_ERROR = 0xc, + NVME_SC_SGL_INVALID_LAST = 0xd, + NVME_SC_SGL_INVALID_COUNT = 0xe, + NVME_SC_SGL_INVALID_DATA = 0xf, + NVME_SC_SGL_INVALID_METADATA = 0x10, + NVME_SC_SGL_INVALID_TYPE = 0x11, + + NVME_SC_SGL_INVALID_OFFSET = 0x16, + NVME_SC_SGL_INVALID_SUBTYPE = 0x17, + + NVME_SC_NS_WRITE_PROTECTED = 0x20, + + NVME_SC_LBA_RANGE = 0x80, + NVME_SC_CAP_EXCEEDED = 0x81, + NVME_SC_NS_NOT_READY = 0x82, + NVME_SC_RESERVATION_CONFLICT = 0x83, + + /* + * Command Specific Status: + */ + NVME_SC_CQ_INVALID = 0x100, + NVME_SC_QID_INVALID = 0x101, + NVME_SC_QUEUE_SIZE = 0x102, + NVME_SC_ABORT_LIMIT = 0x103, + NVME_SC_ABORT_MISSING = 0x104, + NVME_SC_ASYNC_LIMIT = 0x105, + NVME_SC_FIRMWARE_SLOT = 0x106, + NVME_SC_FIRMWARE_IMAGE = 0x107, + NVME_SC_INVALID_VECTOR = 0x108, + NVME_SC_INVALID_LOG_PAGE = 0x109, + NVME_SC_INVALID_FORMAT = 0x10a, + NVME_SC_FW_NEEDS_CONV_RESET = 0x10b, + NVME_SC_INVALID_QUEUE = 0x10c, + NVME_SC_FEATURE_NOT_SAVEABLE = 0x10d, + NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e, + NVME_SC_FEATURE_NOT_PER_NS = 0x10f, + NVME_SC_FW_NEEDS_SUBSYS_RESET = 0x110, + NVME_SC_FW_NEEDS_RESET = 0x111, + NVME_SC_FW_NEEDS_MAX_TIME = 0x112, + NVME_SC_FW_ACIVATE_PROHIBITED = 0x113, + NVME_SC_OVERLAPPING_RANGE = 0x114, + NVME_SC_NS_INSUFFICENT_CAP = 0x115, + NVME_SC_NS_ID_UNAVAILABLE = 0x116, + NVME_SC_NS_ALREADY_ATTACHED = 0x118, + NVME_SC_NS_IS_PRIVATE = 0x119, + NVME_SC_NS_NOT_ATTACHED = 0x11a, + NVME_SC_THIN_PROV_NOT_SUPP = 0x11b, + NVME_SC_CTRL_LIST_INVALID = 0x11c, + + /* + * I/O Command Set Specific - NVM commands: + */ + NVME_SC_BAD_ATTRIBUTES = 0x180, + NVME_SC_INVALID_PI = 0x181, + NVME_SC_READ_ONLY = 0x182, + NVME_SC_ONCS_NOT_SUPPORTED = 0x183, + + /* + * I/O Command Set Specific - Fabrics commands: + */ + NVME_SC_CONNECT_FORMAT = 0x180, + NVME_SC_CONNECT_CTRL_BUSY = 0x181, + NVME_SC_CONNECT_INVALID_PARAM = 0x182, + NVME_SC_CONNECT_RESTART_DISC = 0x183, + NVME_SC_CONNECT_INVALID_HOST = 0x184, + + NVME_SC_DISCOVERY_RESTART = 0x190, + NVME_SC_AUTH_REQUIRED = 0x191, + + /* + * Media and Data Integrity Errors: + */ + NVME_SC_WRITE_FAULT = 0x280, + NVME_SC_READ_ERROR = 0x281, + NVME_SC_GUARD_CHECK = 0x282, + NVME_SC_APPTAG_CHECK = 0x283, + NVME_SC_REFTAG_CHECK = 0x284, + NVME_SC_COMPARE_FAILED = 0x285, + NVME_SC_ACCESS_DENIED = 0x286, + NVME_SC_UNWRITTEN_BLOCK = 0x287, + + /* + * Path-related Errors: + */ + NVME_SC_ANA_PERSISTENT_LOSS = 0x301, + NVME_SC_ANA_INACCESSIBLE = 0x302, + NVME_SC_ANA_TRANSITION = 0x303, + NVME_SC_HOST_PATH_ERROR = 0x370, + + NVME_SC_DNR = 0x4000, +}; + +struct nvme_completion { + /* + * Used by Admin and Fabrics commands to return data: + */ + union nvme_result { + __le16 u16; + __le32 u32; + __le64 u64; + } result; + __le16 sq_head; /* how much of this queue may be reclaimed */ + __le16 sq_id; /* submission queue that generated this entry */ + __u16 command_id; /* of the command which completed */ + __le16 status; /* did the command fail, and if so, why? */ +}; + +#define NVME_VS(major, minor, tertiary) \ + (((major) << 16) | ((minor) << 8) | (tertiary)) + +#define NVME_MAJOR(ver) ((ver) >> 16) +#define NVME_MINOR(ver) (((ver) >> 8) & 0xff) +#define NVME_TERTIARY(ver) ((ver) & 0xff) + +#endif /* _LINUX_NVME_H */