From 8b6c254c3b623d8e44d63b31ffff1bbf6d5b5aca Mon Sep 17 00:00:00 2001 From: Daniel Borca Date: Mon, 14 Nov 2016 03:03:46 +0200 Subject: [PATCH 1/6] separate disk backend ops from block_if logic --- Makefile | 5 + include/xhyve/vdsk/vdsk-int.h | 44 ++++ include/xhyve/vdsk/vdsk-raw.h | 31 +++ include/xhyve/vdsk/vdsk.h | 53 +++++ src/block_if.c | 339 +++------------------------- src/vdsk/vdsk-raw.c | 400 ++++++++++++++++++++++++++++++++++ src/vdsk/vdsk.c | 166 ++++++++++++++ 7 files changed, 725 insertions(+), 313 deletions(-) create mode 100644 include/xhyve/vdsk/vdsk-int.h create mode 100644 include/xhyve/vdsk/vdsk-raw.h create mode 100644 include/xhyve/vdsk/vdsk.h create mode 100644 src/vdsk/vdsk-raw.c create mode 100644 src/vdsk/vdsk.c diff --git a/Makefile b/Makefile index 3294494..8aa8e2c 100644 --- a/Makefile +++ b/Makefile @@ -67,9 +67,14 @@ FIRMWARE_SRC := \ src/firmware/kexec.c \ src/firmware/fbsd.c +VDSK_SRC := \ + src/vdsk/vdsk-raw.c \ + src/vdsk/vdsk.c + SRC := \ $(VMM_SRC) \ $(XHYVE_SRC) \ + $(VDSK_SRC) \ $(FIRMWARE_SRC) OBJ := $(SRC:src/%.c=build/%.o) diff --git a/include/xhyve/vdsk/vdsk-int.h b/include/xhyve/vdsk/vdsk-int.h new file mode 100644 index 0000000..77ef876 --- /dev/null +++ b/include/xhyve/vdsk/vdsk-int.h @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2016 Daniel Borca + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +struct vdsk { + int (*close)(struct vdsk *vdsk); + int (*read)(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf); + int (*write)(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf); + int (*flush)(const struct vdsk *vdsk); + int (*delete)(const struct vdsk *vdsk, struct blockif_req *br); + int bc_isgeom; + int bc_candelete; + int bc_rdonly; + int bc_sectsz; + int bc_psectsz; + int bc_psectoff; + off_t bc_size; +}; diff --git a/include/xhyve/vdsk/vdsk-raw.h b/include/xhyve/vdsk/vdsk-raw.h new file mode 100644 index 0000000..4babcf8 --- /dev/null +++ b/include/xhyve/vdsk/vdsk-raw.h @@ -0,0 +1,31 @@ +/*- + * Copyright (c) 2016 Daniel Borca + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +struct vdsk *vdsk_raw_open(const char *optstr, int numthr); diff --git a/include/xhyve/vdsk/vdsk.h b/include/xhyve/vdsk/vdsk.h new file mode 100644 index 0000000..a215576 --- /dev/null +++ b/include/xhyve/vdsk/vdsk.h @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2016 Daniel Borca + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include +#include + +struct vdsk; + +struct vdsk *vdsk_open(const char *optstr, int numthr); + +int vdsk_close(struct vdsk *vdsk); +int vdsk_read(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf); +int vdsk_write(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf); +int vdsk_flush(const struct vdsk *vdsk); +int vdsk_delete(const struct vdsk *vdsk, struct blockif_req *br); + +void vdsk_chs(const struct vdsk *vdsk, uint16_t *c, uint8_t *h, uint8_t *s); + +off_t vdsk_size(const struct vdsk *vdsk); +int vdsk_sectsz(const struct vdsk *vdsk); +void vdsk_psectsz(const struct vdsk *vdsk, int *size, int *off); +int vdsk_is_ro(const struct vdsk *vdsk); +int vdsk_candelete(const struct vdsk *vdsk); + +uint8_t *vdsk_physbuf(const struct vdsk *vdsk); diff --git a/src/block_if.c b/src/block_if.c index 4d5df95..df5cac1 100644 --- a/src/block_if.c +++ b/src/block_if.c @@ -1,6 +1,7 @@ /*- * Copyright (c) 2013 Peter Grehan * Copyright (c) 2015 xhyve developers + * Copyright (c) 2016 Daniel Borca * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -47,6 +48,7 @@ #include #include #include +#include #define BLOCKIF_SIG 0xb109b109 /* xhyve: FIXME @@ -88,16 +90,8 @@ struct blockif_elem { struct blockif_ctxt { int bc_magic; - int bc_fd; - int bc_ischr; - int bc_isgeom; - int bc_candelete; - int bc_rdonly; - off_t bc_size; - int bc_sectsz; - int bc_psectsz; - int bc_psectoff; int bc_closing; + struct vdsk *bc_vdsk; pthread_t bc_btid[BLOCKIF_NUMTHR]; pthread_mutex_t bc_mtx; pthread_cond_t bc_cond; @@ -121,26 +115,6 @@ static struct blockif_sig_elem *blockif_bse_head; #pragma clang diagnostic pop -static ssize_t -preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset) -{ - off_t res; - - res = lseek(fd, offset, SEEK_SET); - assert(res == offset); - return readv(fd, iov, iovcnt); -} - -static ssize_t -pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset) -{ - off_t res; - - res = lseek(fd, offset, SEEK_SET); - assert(res == offset); - return writev(fd, iov, iovcnt); -} - static int blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) @@ -228,9 +202,7 @@ static void blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) { struct blockif_req *br; - // off_t arg[2]; - ssize_t clen, len, off, boff, voff; - int i, err; + int err; br = be->be_req; if (br->br_iovcnt <= 1) @@ -238,105 +210,16 @@ blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) err = 0; switch (be->be_op) { case BOP_READ: - if (buf == NULL) { - if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, - br->br_offset)) < 0) - err = errno; - else - br->br_resid -= len; - break; - } - i = 0; - off = voff = 0; - while (br->br_resid > 0) { - len = MIN(br->br_resid, MAXPHYS); - if (pread(bc->bc_fd, buf, ((size_t) len), br->br_offset + off) < 0) - { - err = errno; - break; - } - boff = 0; - do { - clen = MIN((len - boff), - (((ssize_t) br->br_iov[i].iov_len) - voff)); - memcpy(((void *) 
(((uintptr_t) br->br_iov[i].iov_base) + - ((size_t) voff))), buf + boff, clen); - if (clen < (((ssize_t) br->br_iov[i].iov_len) - voff)) - voff += clen; - else { - i++; - voff = 0; - } - boff += clen; - } while (boff < len); - off += len; - br->br_resid -= len; - } + err = vdsk_read(bc->bc_vdsk, br, buf); break; case BOP_WRITE: - if (bc->bc_rdonly) { - err = EROFS; - break; - } - if (buf == NULL) { - if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, - br->br_offset)) < 0) - err = errno; - else - br->br_resid -= len; - break; - } - i = 0; - off = voff = 0; - while (br->br_resid > 0) { - len = MIN(br->br_resid, MAXPHYS); - boff = 0; - do { - clen = MIN((len - boff), - (((ssize_t) br->br_iov[i].iov_len) - voff)); - memcpy((buf + boff), - ((void *) (((uintptr_t) br->br_iov[i].iov_base) + - ((size_t) voff))), clen); - if (clen < (((ssize_t) br->br_iov[i].iov_len) - voff)) - voff += clen; - else { - i++; - voff = 0; - } - boff += clen; - } while (boff < len); - if (pwrite(bc->bc_fd, buf, ((size_t) len), br->br_offset + - off) < 0) { - err = errno; - break; - } - off += len; - br->br_resid -= len; - } + err = vdsk_write(bc->bc_vdsk, br, buf); break; case BOP_FLUSH: - if (bc->bc_ischr) { - if (ioctl(bc->bc_fd, DKIOCSYNCHRONIZECACHE)) - err = errno; - } else if (fsync(bc->bc_fd)) - err = errno; + err = vdsk_flush(bc->bc_vdsk); break; case BOP_DELETE: - if (!bc->bc_candelete) { - err = EOPNOTSUPP; - // } else if (bc->bc_rdonly) { - // err = EROFS; - // } else if (bc->bc_ischr) { - // arg[0] = br->br_offset; - // arg[1] = br->br_resid; - // if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) { - // err = errno; - // } else { - // br->br_resid = 0; - // } - } else { - err = EOPNOTSUPP; - } + err = vdsk_delete(bc->bc_vdsk, br); break; } @@ -354,10 +237,7 @@ blockif_thr(void *arg) uint8_t *buf; bc = arg; - if (bc->bc_isgeom) - buf = malloc(MAXPHYS); - else - buf = NULL; + buf = vdsk_physbuf(bc->bc_vdsk); t = pthread_self(); pthread_mutex_lock(&bc->bc_mtx); @@ -417,149 +297,25 @@ 
blockif_init(void) struct blockif_ctxt * blockif_open(const char *optstr, UNUSED const char *ident) { - // char name[MAXPATHLEN]; - char *nopt, *xopts, *cp; struct blockif_ctxt *bc; - struct stat sbuf; - // struct diocgattr_arg arg; - off_t size, psectsz, psectoff; - int extra, fd, i, sectsz; - int nocache, sync, ro, candelete, geom, ssopt, pssopt; + struct vdsk *vdsk; + int i; pthread_once(&blockif_once, blockif_init); - fd = -1; - ssopt = 0; - nocache = 0; - sync = 0; - ro = 0; - - pssopt = 0; - /* - * The first element in the optstring is always a pathname. - * Optional elements follow - */ - nopt = xopts = strdup(optstr); - while (xopts != NULL) { - cp = strsep(&xopts, ","); - if (cp == nopt) /* file or device pathname */ - continue; - else if (!strcmp(cp, "nocache")) - nocache = 1; - else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) - sync = 1; - else if (!strcmp(cp, "ro")) - ro = 1; - else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) - ; - else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) - pssopt = ssopt; - else { - fprintf(stderr, "Invalid device option \"%s\"\n", cp); - goto err; - } - } - - extra = 0; - if (nocache) { - perror("xhyve: nocache support unimplemented"); - goto err; - // extra |= O_DIRECT; - } - if (sync) - extra |= O_SYNC; - - fd = open(nopt, (ro ? 
O_RDONLY : O_RDWR) | extra); - if (fd < 0 && !ro) { - /* Attempt a r/w fail with a r/o open */ - fd = open(nopt, O_RDONLY | extra); - ro = 1; - } - - if (fd < 0) { - perror("Could not open backing file"); - goto err; - } - - if (fstat(fd, &sbuf) < 0) { - perror("Could not stat backing file"); - goto err; - } - - /* - * Deal with raw devices - */ - size = sbuf.st_size; - sectsz = DEV_BSIZE; - psectsz = psectoff = 0; - candelete = geom = 0; - if (S_ISCHR(sbuf.st_mode)) { - perror("xhyve: raw device support unimplemented"); - goto err; - // if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || - // ioctl(fd, DIOCGSECTORSIZE, &sectsz)) - // { - // perror("Could not fetch dev blk/sector size"); - // goto err; - // } - // assert(size != 0); - // assert(sectsz != 0); - // if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) - // ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); - // strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); - // arg.len = sizeof(arg.value.i); - // if (ioctl(fd, DIOCGATTR, &arg) == 0) - // candelete = arg.value.i; - // if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) - // geom = 1; - } else - psectsz = sbuf.st_blksize; - - if (ssopt != 0) { - if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || - ssopt > pssopt) { - fprintf(stderr, "Invalid sector size %d/%d\n", - ssopt, pssopt); - goto err; - } - - // /* - // * Some backend drivers (e.g. cd0, ada0) require that the I/O - // * size be a multiple of the device's sector size. - // * - // * Validate that the emulated sector size complies with this - // * requirement. 
- // */ - // if (S_ISCHR(sbuf.st_mode)) { - // if (ssopt < sectsz || (ssopt % sectsz) != 0) { - // fprintf(stderr, "Sector size %d incompatible " - // "with underlying device sector size %d\n", - // ssopt, sectsz); - // goto err; - // } - // } - - sectsz = ssopt; - psectsz = pssopt; - psectoff = 0; - } - bc = calloc(1, sizeof(struct blockif_ctxt)); if (bc == NULL) { perror("calloc"); goto err; } + vdsk = vdsk_open(optstr, BLOCKIF_NUMTHR); + if (vdsk == NULL) { + goto err; + } + bc->bc_magic = (int) BLOCKIF_SIG; - bc->bc_fd = fd; - bc->bc_ischr = S_ISCHR(sbuf.st_mode); - bc->bc_isgeom = geom; - bc->bc_candelete = candelete; - bc->bc_rdonly = ro; - bc->bc_size = size; - bc->bc_sectsz = sectsz; - bc->bc_psectsz = (int) psectsz; - bc->bc_psectoff = (int) psectoff; + bc->bc_vdsk = vdsk; pthread_mutex_init(&bc->bc_mtx, NULL); pthread_cond_init(&bc->bc_cond, NULL); TAILQ_INIT(&bc->bc_freeq); @@ -576,8 +332,8 @@ blockif_open(const char *optstr, UNUSED const char *ident) return (bc); err: - if (fd >= 0) - close(fd); + if (bc != NULL) + free(bc); return (NULL); } @@ -741,59 +497,17 @@ blockif_close(struct blockif_ctxt *bc) * Release resources */ bc->bc_magic = 0; - close(bc->bc_fd); + vdsk_close(bc->bc_vdsk); free(bc); return (0); } -/* - * Return virtual C/H/S values for a given block. Use the algorithm - * outlined in the VHD specification to calculate values. 
- */ void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) { - off_t sectors; /* total sectors of the block dev */ - off_t hcyl; /* cylinders times heads */ - uint16_t secpt; /* sectors per track */ - uint8_t heads; - assert(bc->bc_magic == ((int) BLOCKIF_SIG)); - - sectors = bc->bc_size / bc->bc_sectsz; - - /* Clamp the size to the largest possible with CHS */ - if (sectors > 65535LL*16*255) - sectors = 65535LL*16*255; - - if (sectors >= 65536LL*16*63) { - secpt = 255; - heads = 16; - hcyl = sectors / secpt; - } else { - secpt = 17; - hcyl = sectors / secpt; - heads = (uint8_t) ((hcyl + 1023) / 1024); - - if (heads < 4) - heads = 4; - - if (hcyl >= (heads * 1024) || heads > 16) { - secpt = 31; - heads = 16; - hcyl = sectors / secpt; - } - if (hcyl >= (heads * 1024)) { - secpt = 63; - heads = 16; - hcyl = sectors / secpt; - } - } - - *c = (uint16_t) (hcyl / heads); - *h = heads; - *s = (uint8_t) secpt; + vdsk_chs(bc->bc_vdsk, c, h, s); } /* @@ -803,22 +517,21 @@ off_t blockif_size(struct blockif_ctxt *bc) { assert(bc->bc_magic == ((int) BLOCKIF_SIG)); - return (bc->bc_size); + return vdsk_size(bc->bc_vdsk); } int blockif_sectsz(struct blockif_ctxt *bc) { assert(bc->bc_magic == ((int) BLOCKIF_SIG)); - return (bc->bc_sectsz); + return vdsk_sectsz(bc->bc_vdsk); } void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) { assert(bc->bc_magic == ((int) BLOCKIF_SIG)); - *size = bc->bc_psectsz; - *off = bc->bc_psectoff; + vdsk_psectsz(bc->bc_vdsk, size, off); } int @@ -832,12 +545,12 @@ int blockif_is_ro(struct blockif_ctxt *bc) { assert(bc->bc_magic == ((int) BLOCKIF_SIG)); - return (bc->bc_rdonly); + return vdsk_is_ro(bc->bc_vdsk); } int blockif_candelete(struct blockif_ctxt *bc) { assert(bc->bc_magic == ((int) BLOCKIF_SIG)); - return (bc->bc_candelete); + return vdsk_candelete(bc->bc_vdsk); } diff --git a/src/vdsk/vdsk-raw.c b/src/vdsk/vdsk-raw.c new file mode 100644 index 0000000..964d8cc --- /dev/null +++ b/src/vdsk/vdsk-raw.c @@ 
-0,0 +1,400 @@ +/*- + * Copyright (c) 2013 Peter Grehan + * Copyright (c) 2015 xhyve developers + * Copyright (c) 2016 Daniel Borca + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +struct vdsk_raw_ctx { + struct vdsk super; + int bc_fd; + int bc_ischr; +}; + +/* xhyve: FIXME + * + * OS X does not support preadv/pwritev, we need to serialize reads and writes + * for the time being until we find a better solution. 
+ */ + +static ssize_t +preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + off_t res; + + res = lseek(fd, offset, SEEK_SET); + assert(res == offset); + return readv(fd, iov, iovcnt); +} + +static ssize_t +pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + off_t res; + + res = lseek(fd, offset, SEEK_SET); + assert(res == offset); + return writev(fd, iov, iovcnt); +} + +static int +disk_close(struct vdsk *vdsk) +{ + struct vdsk_raw_ctx *vp = (struct vdsk_raw_ctx *)vdsk; + + close(vp->bc_fd); + free(vp); + + return (0); +} + +static int +disk_read(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) +{ + const struct vdsk_raw_ctx *vp = (const struct vdsk_raw_ctx *)vdsk; + + ssize_t clen, len, off, boff, voff; + int i, err; + + err = 0; + + if (buf == NULL) { + if ((len = preadv(vp->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + return err; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + if (pread(vp->bc_fd, buf, ((size_t) len), br->br_offset + off) < 0) + { + err = errno; + break; + } + boff = 0; + do { + clen = MIN((len - boff), + (((ssize_t) br->br_iov[i].iov_len) - voff)); + memcpy(((void *) (((uintptr_t) br->br_iov[i].iov_base) + + ((size_t) voff))), buf + boff, clen); + if (clen < (((ssize_t) br->br_iov[i].iov_len) - voff)) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + off += len; + br->br_resid -= len; + } + + return err; +} + +static int +disk_write(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) +{ + const struct vdsk_raw_ctx *vp = (const struct vdsk_raw_ctx *)vdsk; + + ssize_t clen, len, off, boff, voff; + int i, err; + + err = 0; + + if (vdsk->bc_rdonly) { + err = EROFS; + return err; + } + if (buf == NULL) { + if ((len = pwritev(vp->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + return 
err; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + boff = 0; + do { + clen = MIN((len - boff), + (((ssize_t) br->br_iov[i].iov_len) - voff)); + memcpy((buf + boff), + ((void *) (((uintptr_t) br->br_iov[i].iov_base) + + ((size_t) voff))), clen); + if (clen < (((ssize_t) br->br_iov[i].iov_len) - voff)) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + if (pwrite(vp->bc_fd, buf, ((size_t) len), br->br_offset + + off) < 0) { + err = errno; + break; + } + off += len; + br->br_resid -= len; + } + + return err; +} + +static int +disk_flush(const struct vdsk *vdsk) +{ + const struct vdsk_raw_ctx *vp = (const struct vdsk_raw_ctx *)vdsk; + + int err = 0; + + if (vp->bc_ischr) { + if (ioctl(vp->bc_fd, DKIOCSYNCHRONIZECACHE)) + err = errno; + } else if (fsync(vp->bc_fd)) + err = errno; + + return err; +} + +static int +disk_delete(const struct vdsk *vdsk, UNUSED struct blockif_req *br) +{ + // const struct vdsk_raw_ctx *vp = (const struct vdsk_raw_ctx *)vdsk; + + // off_t arg[2]; + int err = 0; + + if (!vdsk->bc_candelete) { + err = EOPNOTSUPP; + // } else if (vdsk->bc_rdonly) { + // err = EROFS; + // } else if (vp->bc_ischr) { + // arg[0] = br->br_offset; + // arg[1] = br->br_resid; + // if (ioctl(vp->bc_fd, DIOCGDELETE, arg)) { + // err = errno; + // } else { + // br->br_resid = 0; + // } + } else { + err = EOPNOTSUPP; + } + + return err; +} + +struct vdsk * +vdsk_raw_open(const char *optstr, int numthr) +{ + // char name[MAXPATHLEN]; + char *nopt, *xopts, *cp; + struct vdsk_raw_ctx *bc; + struct stat sbuf; + // struct diocgattr_arg arg; + off_t size, psectsz, psectoff; + int extra, fd, sectsz; + int nocache, sync, ro, candelete, geom, ssopt, pssopt; + + assert(numthr == 1); + + fd = -1; + ssopt = 0; + nocache = 0; + sync = 0; + ro = 0; + + pssopt = 0; + /* + * The first element in the optstring is always a pathname. 
+ * Optional elements follow + */ + nopt = xopts = strdup(optstr); + while (xopts != NULL) { + cp = strsep(&xopts, ","); + if (cp == nopt) /* file or device pathname */ + continue; + else if (!strcmp(cp, "nocache")) + nocache = 1; + else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) + sync = 1; + else if (!strcmp(cp, "ro")) + ro = 1; + else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) + ; + else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) + pssopt = ssopt; + else { + fprintf(stderr, "Invalid device option \"%s\"\n", cp); + goto err; + } + } + + extra = 0; + if (nocache) { + perror("xhyve: nocache support unimplemented"); + goto err; + // extra |= O_DIRECT; + } + if (sync) + extra |= O_SYNC; + + fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); + if (fd < 0 && !ro) { + /* Attempt a r/w fail with a r/o open */ + fd = open(nopt, O_RDONLY | extra); + ro = 1; + } + + if (fd < 0) { + perror("Could not open backing file"); + goto err; + } + + if (fstat(fd, &sbuf) < 0) { + perror("Could not stat backing file"); + goto err; + } + + /* + * Deal with raw devices + */ + size = sbuf.st_size; + sectsz = DEV_BSIZE; + psectsz = psectoff = 0; + candelete = geom = 0; + if (S_ISCHR(sbuf.st_mode)) { + perror("xhyve: raw device support unimplemented"); + goto err; + // if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || + // ioctl(fd, DIOCGSECTORSIZE, &sectsz)) + // { + // perror("Could not fetch dev blk/sector size"); + // goto err; + // } + // assert(size != 0); + // assert(sectsz != 0); + // if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) + // ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); + // strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); + // arg.len = sizeof(arg.value.i); + // if (ioctl(fd, DIOCGATTR, &arg) == 0) + // candelete = arg.value.i; + // if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) + // geom = 1; + } else + psectsz = sbuf.st_blksize; + + if (ssopt != 0) { + if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || + ssopt > pssopt) { + 
fprintf(stderr, "Invalid sector size %d/%d\n", + ssopt, pssopt); + goto err; + } + + // /* + // * Some backend drivers (e.g. cd0, ada0) require that the I/O + // * size be a multiple of the device's sector size. + // * + // * Validate that the emulated sector size complies with this + // * requirement. + // */ + // if (S_ISCHR(sbuf.st_mode)) { + // if (ssopt < sectsz || (ssopt % sectsz) != 0) { + // fprintf(stderr, "Sector size %d incompatible " + // "with underlying device sector size %d\n", + // ssopt, sectsz); + // goto err; + // } + // } + + sectsz = ssopt; + psectsz = pssopt; + psectoff = 0; + } + + bc = calloc(1, sizeof(struct vdsk_raw_ctx)); + if (bc == NULL) { + perror("calloc"); + goto err; + } + + bc->bc_fd = fd; + bc->bc_ischr = S_ISCHR(sbuf.st_mode); + bc->super.bc_isgeom = geom; + bc->super.bc_candelete = candelete; + bc->super.bc_rdonly = ro; + bc->super.bc_size = size; + bc->super.bc_sectsz = sectsz; + bc->super.bc_psectsz = (int) psectsz; + bc->super.bc_psectoff = (int) psectoff; + + bc->super.close = disk_close; + bc->super.read = disk_read; + bc->super.write = disk_write; + bc->super.flush = disk_flush; + bc->super.delete = disk_delete; + + free(nopt); + return (struct vdsk *)bc; +err: + if (fd >= 0) + close(fd); + free(nopt); + return (NULL); +} diff --git a/src/vdsk/vdsk.c b/src/vdsk/vdsk.c new file mode 100644 index 0000000..cd02ba2 --- /dev/null +++ b/src/vdsk/vdsk.c @@ -0,0 +1,166 @@ +/*- + * Copyright (c) 2013 Peter Grehan + * Copyright (c) 2015 xhyve developers + * Copyright (c) 2016 Daniel Borca + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include + +#include +#include +#include +#include +#include +#include + +struct vdsk * +vdsk_open(const char *optstr, int numthr) +{ + /* for now, the one and only backend */ + return vdsk_raw_open(optstr, numthr); +} + +int +vdsk_close(struct vdsk *vdsk) +{ + return vdsk->close(vdsk); +} + +int +vdsk_read(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) +{ + return vdsk->read(vdsk, br, buf); +} + +int +vdsk_write(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) +{ + return vdsk->write(vdsk, br, buf); +} + +int +vdsk_flush(const struct vdsk *vdsk) +{ + return vdsk->flush(vdsk); +} + +int +vdsk_delete(const struct vdsk *vdsk, struct blockif_req *br) +{ + return vdsk->delete(vdsk, br); +} + +uint8_t * +vdsk_physbuf(const struct vdsk *vdsk) +{ + if (vdsk->bc_isgeom) { + return malloc(MAXPHYS); + } + return NULL; +} + +/* + * Return virtual C/H/S values for a given block. Use the algorithm + * outlined in the VHD specification to calculate values. 
+ */ +void +vdsk_chs(const struct vdsk *vdsk, uint16_t *c, uint8_t *h, uint8_t *s) +{ + off_t sectors; /* total sectors of the block dev */ + off_t hcyl; /* cylinders times heads */ + uint16_t secpt; /* sectors per track */ + uint8_t heads; + + sectors = vdsk->bc_size / vdsk->bc_sectsz; + + /* Clamp the size to the largest possible with CHS */ + if (sectors > 65535LL*16*255) + sectors = 65535LL*16*255; + + if (sectors >= 65536LL*16*63) { + secpt = 255; + heads = 16; + hcyl = sectors / secpt; + } else { + secpt = 17; + hcyl = sectors / secpt; + heads = (uint8_t) ((hcyl + 1023) / 1024); + + if (heads < 4) + heads = 4; + + if (hcyl >= (heads * 1024) || heads > 16) { + secpt = 31; + heads = 16; + hcyl = sectors / secpt; + } + if (hcyl >= (heads * 1024)) { + secpt = 63; + heads = 16; + hcyl = sectors / secpt; + } + } + + *c = (uint16_t) (hcyl / heads); + *h = heads; + *s = (uint8_t) secpt; +} + +/* + * Accessors + */ + +off_t +vdsk_size(const struct vdsk *vdsk) +{ + return (vdsk->bc_size); +} + +int +vdsk_sectsz(const struct vdsk *vdsk) +{ + return (vdsk->bc_sectsz); +} + +void +vdsk_psectsz(const struct vdsk *vdsk, int *size, int *off) +{ + *size = vdsk->bc_psectsz; + *off = vdsk->bc_psectoff; +} + +int +vdsk_is_ro(const struct vdsk *vdsk) +{ + return (vdsk->bc_rdonly); +} + +int +vdsk_candelete(const struct vdsk *vdsk) +{ + return (vdsk->bc_candelete); +} From 20c3fcb5fe327e02281e0b5dff363a4951c60456 Mon Sep 17 00:00:00 2001 From: Daniel Borca Date: Wed, 30 Nov 2016 16:55:47 +0200 Subject: [PATCH 2/6] unconst the vdsk API --- include/xhyve/vdsk/vdsk-int.h | 8 ++++---- include/xhyve/vdsk/vdsk.h | 8 ++++---- src/vdsk/vdsk-raw.c | 8 ++++---- src/vdsk/vdsk.c | 12 ++++++------ 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/xhyve/vdsk/vdsk-int.h b/include/xhyve/vdsk/vdsk-int.h index 77ef876..da41f0d 100644 --- a/include/xhyve/vdsk/vdsk-int.h +++ b/include/xhyve/vdsk/vdsk-int.h @@ -30,10 +30,10 @@ struct vdsk { int (*close)(struct vdsk *vdsk); - int 
(*read)(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf); - int (*write)(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf); - int (*flush)(const struct vdsk *vdsk); - int (*delete)(const struct vdsk *vdsk, struct blockif_req *br); + int (*read)(struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf); + int (*write)(struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf); + int (*flush)(struct vdsk *vdsk); + int (*delete)(struct vdsk *vdsk, struct blockif_req *br); int bc_isgeom; int bc_candelete; int bc_rdonly; diff --git a/include/xhyve/vdsk/vdsk.h b/include/xhyve/vdsk/vdsk.h index a215576..12bcb29 100644 --- a/include/xhyve/vdsk/vdsk.h +++ b/include/xhyve/vdsk/vdsk.h @@ -37,10 +37,10 @@ struct vdsk; struct vdsk *vdsk_open(const char *optstr, int numthr); int vdsk_close(struct vdsk *vdsk); -int vdsk_read(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf); -int vdsk_write(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf); -int vdsk_flush(const struct vdsk *vdsk); -int vdsk_delete(const struct vdsk *vdsk, struct blockif_req *br); +int vdsk_read(struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf); +int vdsk_write(struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf); +int vdsk_flush(struct vdsk *vdsk); +int vdsk_delete(struct vdsk *vdsk, struct blockif_req *br); void vdsk_chs(const struct vdsk *vdsk, uint16_t *c, uint8_t *h, uint8_t *s); diff --git a/src/vdsk/vdsk-raw.c b/src/vdsk/vdsk-raw.c index 964d8cc..f1cc4a6 100644 --- a/src/vdsk/vdsk-raw.c +++ b/src/vdsk/vdsk-raw.c @@ -95,7 +95,7 @@ disk_close(struct vdsk *vdsk) } static int -disk_read(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) +disk_read(struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) { const struct vdsk_raw_ctx *vp = (const struct vdsk_raw_ctx *)vdsk; @@ -143,7 +143,7 @@ disk_read(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) } static int -disk_write(const struct vdsk *vdsk, struct blockif_req *br, 
uint8_t *buf) +disk_write(struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) { const struct vdsk_raw_ctx *vp = (const struct vdsk_raw_ctx *)vdsk; @@ -196,7 +196,7 @@ disk_write(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) } static int -disk_flush(const struct vdsk *vdsk) +disk_flush(struct vdsk *vdsk) { const struct vdsk_raw_ctx *vp = (const struct vdsk_raw_ctx *)vdsk; @@ -212,7 +212,7 @@ disk_flush(const struct vdsk *vdsk) } static int -disk_delete(const struct vdsk *vdsk, UNUSED struct blockif_req *br) +disk_delete(struct vdsk *vdsk, UNUSED struct blockif_req *br) { // const struct vdsk_raw_ctx *vp = (const struct vdsk_raw_ctx *)vdsk; diff --git a/src/vdsk/vdsk.c b/src/vdsk/vdsk.c index cd02ba2..faa5a8f 100644 --- a/src/vdsk/vdsk.c +++ b/src/vdsk/vdsk.c @@ -40,8 +40,8 @@ struct vdsk * vdsk_open(const char *optstr, int numthr) { - /* for now, the one and only backend */ - return vdsk_raw_open(optstr, numthr); + /* for now, the one and only backend */ + return vdsk_raw_open(optstr, numthr); } int @@ -51,25 +51,25 @@ vdsk_close(struct vdsk *vdsk) } int -vdsk_read(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) +vdsk_read(struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) { return vdsk->read(vdsk, br, buf); } int -vdsk_write(const struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) +vdsk_write(struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) { return vdsk->write(vdsk, br, buf); } int -vdsk_flush(const struct vdsk *vdsk) +vdsk_flush(struct vdsk *vdsk) { return vdsk->flush(vdsk); } int -vdsk_delete(const struct vdsk *vdsk, struct blockif_req *br) +vdsk_delete(struct vdsk *vdsk, struct blockif_req *br) { return vdsk->delete(vdsk, br); } From 26b9ba4909cf301e8c77199a294dbcddf7b929f3 Mon Sep 17 00:00:00 2001 From: Daniel Borca Date: Wed, 30 Nov 2016 16:57:40 +0200 Subject: [PATCH 3/6] support for raw devices --- src/vdsk/vdsk-raw.c | 45 +++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 
18 deletions(-) diff --git a/src/vdsk/vdsk-raw.c b/src/vdsk/vdsk-raw.c index f1cc4a6..534934e 100644 --- a/src/vdsk/vdsk-raw.c +++ b/src/vdsk/vdsk-raw.c @@ -246,7 +246,7 @@ vdsk_raw_open(const char *optstr, int numthr) struct vdsk_raw_ctx *bc; struct stat sbuf; // struct diocgattr_arg arg; - off_t size, psectsz, psectoff; + off_t size, psectsz, psectoff, blocks; int extra, fd, sectsz; int nocache, sync, ro, candelete, geom, ssopt, pssopt; @@ -317,9 +317,18 @@ vdsk_raw_open(const char *optstr, int numthr) sectsz = DEV_BSIZE; psectsz = psectoff = 0; candelete = geom = 0; + blocks = 0; if (S_ISCHR(sbuf.st_mode)) { - perror("xhyve: raw device support unimplemented"); - goto err; + if (ioctl(fd, DKIOCGETBLOCKCOUNT, &blocks) < 0 || + ioctl(fd, DKIOCGETBLOCKSIZE, &sectsz) || + ioctl(fd, DKIOCGETPHYSICALBLOCKSIZE, &psectsz)) + { + perror("Could not fetch dev blk/sector size"); + goto err; + } + size = blocks * sectsz; + assert(size != 0); + assert(psectsz != 0); // if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || // ioctl(fd, DIOCGSECTORSIZE, &sectsz)) // { @@ -347,21 +356,21 @@ vdsk_raw_open(const char *optstr, int numthr) goto err; } - // /* - // * Some backend drivers (e.g. cd0, ada0) require that the I/O - // * size be a multiple of the device's sector size. - // * - // * Validate that the emulated sector size complies with this - // * requirement. - // */ - // if (S_ISCHR(sbuf.st_mode)) { - // if (ssopt < sectsz || (ssopt % sectsz) != 0) { - // fprintf(stderr, "Sector size %d incompatible " - // "with underlying device sector size %d\n", - // ssopt, sectsz); - // goto err; - // } - // } + /* + * Some backend drivers (e.g. cd0, ada0) require that the I/O + * size be a multiple of the device's sector size. + * + * Validate that the emulated sector size complies with this + * requirement. 
+ */ + if (S_ISCHR(sbuf.st_mode)) { + if (ssopt < sectsz || (ssopt % sectsz) != 0) { + fprintf(stderr, "Sector size %d incompatible " + "with underlying device sector size %d\n", + ssopt, sectsz); + goto err; + } + } sectsz = ssopt; psectsz = pssopt; From 94f51af7770aa7432a119061da356d8ade36e08e Mon Sep 17 00:00:00 2001 From: Daniel Borca Date: Wed, 30 Nov 2016 16:58:39 +0200 Subject: [PATCH 4/6] drop privileges after all PCI devices are initialized --- src/pci_emul.c | 24 ++++++++++++++++++++++++ src/pci_virtio_net_vmnet.c | 24 ------------------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/pci_emul.c b/src/pci_emul.c index 5b34694..a52bdcd 100644 --- a/src/pci_emul.c +++ b/src/pci_emul.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -133,6 +134,25 @@ CFGREAD(struct pci_devinst *pi, int coff, int bytes) return (pci_get_cfgdata32(pi, coff)); } +/* + * Drop privileges according to the CERT Secure C Coding Standard section + * POS36-C + * https://www.securecoding.cert.org/confluence/display/c/POS36-C.+Observe+correct+revocation+order+while+relinquishing+privileges +*/ +static int drop_privileges(void) { + // If we are not effectively root, don't drop privileges + if (geteuid() != 0 && getegid() != 0) { + return 0; + } + if (setgid(getgid()) == -1) { + return -1; + } + if (setuid(getuid()) == -1) { + return -1; + } + return 0; +} + /* * I/O access */ @@ -1121,6 +1141,10 @@ init_pci(void) ((uint64_t) BUSMEM_ROUNDUP)); bi->memlimit64 = pci_emul_membase64; } + if (drop_privileges() == -1) { + perror("Dropping privileges after PCI init."); + return (-1); + } /* * PCI backends are initialized before routing INTx interrupts diff --git a/src/pci_virtio_net_vmnet.c b/src/pci_virtio_net_vmnet.c index 4aa285c..8d018ea 100644 --- a/src/pci_virtio_net_vmnet.c +++ b/src/pci_virtio_net_vmnet.c @@ -199,25 +199,6 @@ struct vmnet_state { static void pci_vtnet_tap_callback(struct pci_vtnet_softc *sc); -/* - * Drop privileges 
according to the CERT Secure C Coding Standard section - * POS36-C - * https://www.securecoding.cert.org/confluence/display/c/POS36-C.+Observe+correct+revocation+order+while+relinquishing+privileges -*/ -static int drop_privileges(void) { - // If we are not effectively root, don't drop privileges - if (geteuid() != 0 && getegid() != 0) { - return 0; - } - if (setgid(getgid()) == -1) { - return -1; - } - if (setuid(getuid()) == -1) { - return -1; - } - return 0; -} - /* * Create an interface for the guest using Apple's vmnet framework. * @@ -313,11 +294,6 @@ vmn_create(struct pci_vtnet_softc *sc) { pci_vtnet_tap_callback(sc); }); - if (drop_privileges() == -1) { - perror("Dropping privileges after networking was enabled."); - free(vms); - return (-1); - } return (0); } From 828bccf051dc6d62fc2228e07675438dbeff9c72 Mon Sep 17 00:00:00 2001 From: Daniel Borca Date: Wed, 30 Nov 2016 17:04:11 +0200 Subject: [PATCH 5/6] each vdsk backend can signal stop further probing --- include/xhyve/vdsk/vdsk-raw.h | 2 +- src/vdsk/vdsk-raw.c | 3 ++- src/vdsk/vdsk.c | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/xhyve/vdsk/vdsk-raw.h b/include/xhyve/vdsk/vdsk-raw.h index 4babcf8..dae4c59 100644 --- a/include/xhyve/vdsk/vdsk-raw.h +++ b/include/xhyve/vdsk/vdsk-raw.h @@ -28,4 +28,4 @@ #pragma once -struct vdsk *vdsk_raw_open(const char *optstr, int numthr); +struct vdsk *vdsk_raw_open(const char *optstr, int numthr, int *fatal); diff --git a/src/vdsk/vdsk-raw.c b/src/vdsk/vdsk-raw.c index 534934e..24f950d 100644 --- a/src/vdsk/vdsk-raw.c +++ b/src/vdsk/vdsk-raw.c @@ -239,7 +239,7 @@ disk_delete(struct vdsk *vdsk, UNUSED struct blockif_req *br) } struct vdsk * -vdsk_raw_open(const char *optstr, int numthr) +vdsk_raw_open(const char *optstr, int numthr, int *fatal) { // char name[MAXPATHLEN]; char *nopt, *xopts, *cp; @@ -252,6 +252,7 @@ vdsk_raw_open(const char *optstr, int numthr) assert(numthr == 1); + *fatal = 1; fd = -1; ssopt = 0; nocache = 0; diff 
--git a/src/vdsk/vdsk.c b/src/vdsk/vdsk.c index faa5a8f..9033687 100644 --- a/src/vdsk/vdsk.c +++ b/src/vdsk/vdsk.c @@ -40,8 +40,9 @@ struct vdsk * vdsk_open(const char *optstr, int numthr) { + int fatal; /* for now, the one and only backend */ - return vdsk_raw_open(optstr, numthr); + return vdsk_raw_open(optstr, numthr, &fatal); } int From f26cfbd7b612a054a8af75efac205498339d6b81 Mon Sep 17 00:00:00 2001 From: Daniel Borca Date: Wed, 30 Nov 2016 17:06:43 +0200 Subject: [PATCH 6/6] support VirtualBox VDI --- Makefile | 1 + include/xhyve/vdsk/vdsk-vdi.h | 31 ++ src/vdsk/vdsk-vdi.c | 606 ++++++++++++++++++++++++++++++++++ src/vdsk/vdsk.c | 7 +- 4 files changed, 644 insertions(+), 1 deletion(-) create mode 100644 include/xhyve/vdsk/vdsk-vdi.h create mode 100644 src/vdsk/vdsk-vdi.c diff --git a/Makefile b/Makefile index 8aa8e2c..ee3daa8 100644 --- a/Makefile +++ b/Makefile @@ -69,6 +69,7 @@ FIRMWARE_SRC := \ VDSK_SRC := \ src/vdsk/vdsk-raw.c \ + src/vdsk/vdsk-vdi.c \ src/vdsk/vdsk.c SRC := \ diff --git a/include/xhyve/vdsk/vdsk-vdi.h b/include/xhyve/vdsk/vdsk-vdi.h new file mode 100644 index 0000000..f564e09 --- /dev/null +++ b/include/xhyve/vdsk/vdsk-vdi.h @@ -0,0 +1,31 @@ +/*- + * Copyright (c) 2016 Daniel Borca + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +struct vdsk *vdsk_vdi_open(const char *optstr, int numthr, int *fatal); diff --git a/src/vdsk/vdsk-vdi.c b/src/vdsk/vdsk-vdi.c new file mode 100644 index 0000000..5e5f2c2 --- /dev/null +++ b/src/vdsk/vdsk-vdi.c @@ -0,0 +1,606 @@ +/*- + * Copyright (c) 2016 Daniel Borca + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +// XXX vvv GPL (snarfed from qemu) +typedef struct { + char text[0x40]; + uint32_t signature; + uint32_t version; + uint32_t header_size; + uint32_t image_type; + uint32_t image_flags; + char description[256]; + uint32_t offset_bmap; + uint32_t offset_data; + uint32_t cylinders; /* disk geometry, unused here */ + uint32_t heads; /* disk geometry, unused here */ + uint32_t sectors; /* disk geometry, unused here */ + uint32_t sector_size; + uint32_t unused1; + uint64_t disk_size; + uint32_t block_size; + uint32_t block_extra; /* unused here */ + uint32_t blocks_in_image; + uint32_t blocks_allocated; + uuid_t uuid_image; + uuid_t uuid_last_snap; + uuid_t uuid_link; + uuid_t uuid_parent; + uint64_t unused2[7]; +} VdiHeader; +// XXX ^^^ GPL (snarfed from qemu) + +struct vdsk_vdi_ctx { + struct vdsk super; + int bc_fd; + int delay; + int dirty; + int blog2; + uint32_t *bmap; + VdiHeader vdi; +}; + +/* xhyve: FIXME + * + * As VDIs probably need multiple reads and writes we can not + * use preadv/pwritev, we need to serialize reads and writes + * for the time being until we find a better solution. 
+ */ + +static int +getlog2(uint32_t n) +{ + int x; + if (!n || !powerof2(n)) { + return -1; + } + for (x = 0; n >>= 1; x++) { + } + return x; +} + +static int +is_zero(char *block, size_t size) +{ +#if 1 + const size_t width = sizeof(long); + while (size && (uintptr_t)block % width) { + if (*block) { + return 0; + } + size--; + block++; + } + while (size >= width) { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wcast-align" + if (*(long *)block) { + return 0; + } +#pragma clang diagnostic pop + size -= width; + block += width; + } +#endif + while (size) { + if (*block) { + return 0; + } + size--; + block++; + } + return 1; +} + +static int +update_header(struct vdsk_vdi_ctx *vp, size_t first, size_t last) +{ + size_t p0; + size_t p1; + ssize_t n; + if (!vp->dirty) { + return 0; + } + p0 = first * sizeof(uint32_t) & ~((size_t)DEV_BSIZE - 1); + p1 = (last * sizeof(uint32_t) + DEV_BSIZE - 1) & ~((size_t)DEV_BSIZE - 1); + n = pwrite(vp->bc_fd, (char *)vp->bmap + p0, p1 - p0, (off_t)p0 + vp->vdi.offset_bmap); + if (n < 0) { + // XXX corrupted + return errno; + } + n = pwrite(vp->bc_fd, &vp->vdi, sizeof(VdiHeader), 0); + if (n < 0) { + // XXX corrupted + return errno; + } + vp->dirty = 0; + return 0; +} + +static ssize_t +vdi_pread(struct vdsk_vdi_ctx *vp, void *buf, size_t nbyte, off_t offset) +{ + char *dst = buf; + ssize_t n; + size_t chunk; + uint32_t offset_of_block; + size_t block_size = vp->vdi.block_size; + size_t block_num = (size_t)(offset >> vp->blog2); + size_t offset_in = (size_t)offset & (block_size - 1); + int growable = (vp->vdi.image_type == 1); + + while (nbyte) { + chunk = block_size - offset_in; + if (chunk > nbyte) { + chunk = nbyte; + } + offset_of_block = vp->bmap[block_num]; + if (offset_of_block >= 0xfffffffe && growable) { + memset(dst, 0, chunk); + } else { + n = pread(vp->bc_fd, dst, chunk, (off_t)vp->vdi.offset_data + ((off_t)offset_of_block << vp->blog2) + (off_t)offset_in); + if (n < 0) { + goto err; + } + } + nbyte -= 
chunk; + dst += chunk; + block_num++; + offset_in = 0; + } + + return dst - (char *)buf; +err: + return -1; +} + +static ssize_t +vdi_pwrite(struct vdsk_vdi_ctx *vp, void *buf, size_t nbyte, off_t offset) +{ + char *dst = buf; + ssize_t n; + size_t chunk; + uint32_t offset_of_block; + size_t block_size = vp->vdi.block_size; + size_t block_num = (size_t)(offset >> vp->blog2); + size_t offset_in = (size_t)offset & (block_size - 1); + int growable = (vp->vdi.image_type == 1); + size_t first = 0, last = 0; + char *block = NULL; + + while (nbyte) { + chunk = block_size - offset_in; + if (chunk > nbyte) { + chunk = nbyte; + } + offset_of_block = vp->bmap[block_num]; + if (offset_of_block >= 0xfffffffe && growable) { + if (block == NULL) { + block = malloc(block_size); + if (block == NULL) { + break; + } + first = block_num; + } + last = block_num; + memset(block, 0, offset_in); + memcpy(block + offset_in, dst, chunk); + memset(block + offset_in + chunk, 0, block_size - offset_in - chunk); + if (is_zero(block, block_size)) { + vp->bmap[block_num] = 0xfffffffe; + vp->dirty |= (offset_of_block != 0xfffffffe); + } else { + if (vp->vdi.blocks_allocated == vp->vdi.blocks_in_image) { + goto err; + } + offset_of_block = vp->vdi.blocks_allocated++; + vp->bmap[block_num] = offset_of_block; + vp->dirty |= 1; + n = pwrite(vp->bc_fd, block, block_size, (off_t)vp->vdi.offset_data + ((off_t)offset_of_block << vp->blog2)); + if (n < 0) { + goto err; + } + } + } else { + n = pwrite(vp->bc_fd, dst, chunk, (off_t)vp->vdi.offset_data + ((off_t)offset_of_block << vp->blog2) + (off_t)offset_in); + if (n < 0) { + goto err; + } + } + nbyte -= chunk; + dst += chunk; + block_num++; + offset_in = 0; + } + + if (!vp->delay) { + int rv = update_header(vp, first, last + 1); + if (rv < 0) { + goto err; + } + } + + free(block); + return dst - (char *)buf; +err: + free(block); + return -1; +} + +static ssize_t +preadv(struct vdsk_vdi_ctx *vp, const struct iovec *iov, int iovcnt, off_t offset) +{ + int 
i; + ssize_t len, total = 0; + + for (i = 0; i < iovcnt; i++) { + len = vdi_pread(vp, iov[i].iov_base, iov[i].iov_len, offset); + if (len < 0) { + return len; + } + total += len; + offset += iov[i].iov_len; + } + + return total; +} + +static ssize_t +pwritev(struct vdsk_vdi_ctx *vp, const struct iovec *iov, int iovcnt, off_t offset) +{ + int i; + ssize_t len, total = 0; + + for (i = 0; i < iovcnt; i++) { + len = vdi_pwrite(vp, iov[i].iov_base, iov[i].iov_len, offset); + if (len < 0) { + return len; + } + total += len; + offset += iov[i].iov_len; + } + + return total; +} + +static int +disk_close(struct vdsk *vdsk) +{ + struct vdsk_vdi_ctx *vp = (struct vdsk_vdi_ctx *)vdsk; + + if (vp->delay) { + update_header(vp, 0, vp->vdi.blocks_in_image); + } + free(vp->bmap); + close(vp->bc_fd); + free(vp); + + return (0); +} + +static int +disk_read(struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) +{ + struct vdsk_vdi_ctx *vp = (struct vdsk_vdi_ctx *)vdsk; + + ssize_t clen, len, off, boff, voff; + int i, err; + + err = 0; + + if (buf == NULL) { + if ((len = preadv(vp, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + return err; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + if (vdi_pread(vp, buf, ((size_t) len), br->br_offset + off) < 0) + { + err = errno; + break; + } + boff = 0; + do { + clen = MIN((len - boff), + (((ssize_t) br->br_iov[i].iov_len) - voff)); + memcpy(((void *) (((uintptr_t) br->br_iov[i].iov_base) + + ((size_t) voff))), buf + boff, clen); + if (clen < (((ssize_t) br->br_iov[i].iov_len) - voff)) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + off += len; + br->br_resid -= len; + } + + return err; +} + +static int +disk_write(struct vdsk *vdsk, struct blockif_req *br, uint8_t *buf) +{ + struct vdsk_vdi_ctx *vp = (struct vdsk_vdi_ctx *)vdsk; + + ssize_t clen, len, off, boff, voff; + int i, err; + + err = 0; + + if 
(vdsk->bc_rdonly) { + err = EROFS; + return err; + } + if (buf == NULL) { + if ((len = pwritev(vp, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + return err; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + boff = 0; + do { + clen = MIN((len - boff), + (((ssize_t) br->br_iov[i].iov_len) - voff)); + memcpy((buf + boff), + ((void *) (((uintptr_t) br->br_iov[i].iov_base) + + ((size_t) voff))), clen); + if (clen < (((ssize_t) br->br_iov[i].iov_len) - voff)) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + if (vdi_pwrite(vp, buf, ((size_t) len), br->br_offset + + off) < 0) { + err = errno; + break; + } + off += len; + br->br_resid -= len; + } + + return err; +} + +static int +disk_flush(struct vdsk *vdsk) +{ + struct vdsk_vdi_ctx *vp = (struct vdsk_vdi_ctx *)vdsk; + + int err = 0; + + if (vp->delay) { + err = update_header(vp, 0, vp->vdi.blocks_in_image); + } + if (fsync(vp->bc_fd)) + err = errno; + + return err; +} + +static int +disk_delete(struct vdsk *vdsk, UNUSED struct blockif_req *br) +{ + int err = 0; + + if (vdsk->bc_rdonly) { + err = EROFS; + } else { + err = EOPNOTSUPP; + } + + return err; +} + +struct vdsk * +vdsk_vdi_open(const char *optstr, int numthr, int *fatal) +{ + char *nopt, *xopts, *cp; + struct vdsk_vdi_ctx *bc; + struct stat sbuf; + off_t psectsz; + int extra, fd; + int ro, delay; + int block_shift; + VdiHeader header; + uint32_t *bmap; + size_t sz, bmap_size; + + assert(numthr == 1); + + *fatal = 1; + fd = -1; + ro = 0; + delay = 0; + + /* + * The first element in the optstring is always a pathname. 
+ * Optional elements follow + */ + nopt = xopts = strdup(optstr); + while (xopts != NULL) { + cp = strsep(&xopts, ","); + if (cp == nopt) /* file or device pathname */ + continue; + else if (!strcmp(cp, "ro")) + ro = 1; + else if (!strcmp(cp, "delay")) + delay = 1; + else { + fprintf(stderr, "Invalid device option \"%s\"\n", cp); + goto err; + } + } + + extra = 0; + + fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); + if (fd < 0 && !ro) { + /* Attempt a r/w fail with a r/o open */ + fd = open(nopt, O_RDONLY | extra); + ro = 1; + } + + if (fd < 0) { + perror("Could not open backing file"); + goto err; + } + + if (fstat(fd, &sbuf) < 0) { + perror("Could not stat backing file"); + goto err; + } + psectsz = sbuf.st_blksize; + + sz = (size_t)read(fd, &header, sizeof(header)); + if (sz != sizeof(header)) { + perror("Could not read backing file"); + goto err; + } + if (header.signature != 0xBEDA107F) { + *fatal = 0; + goto err; + } + if (header.version != 0x00010001) { + goto err; + } + if (header.sector_size != DEV_BSIZE || + header.offset_bmap % DEV_BSIZE || + header.offset_data % DEV_BSIZE || + header.block_size % header.sector_size || + (block_shift = getlog2(header.block_size)) < 0) { + fprintf(stderr, "Invalid VDI\n"); + goto err; + } + if (header.blocks_allocated > header.blocks_in_image) { + fprintf(stderr, "Invalid VDI\n"); + goto err; + } + if (header.image_type != 1 && header.image_type != 2) { + fprintf(stderr, "Invalid VDI\n"); + goto err; + } + if (header.image_type == 1 && header.image_flags & 2) { + fprintf(stderr, "Invalid VDI\n"); + goto err; + } + if (header.image_type == 2 && header.blocks_allocated != header.blocks_in_image) { + fprintf(stderr, "Invalid VDI\n"); + goto err; + } + + bmap_size = (header.blocks_in_image * sizeof(uint32_t) + DEV_BSIZE - 1) & ~((size_t)DEV_BSIZE - 1); + + bmap = malloc(bmap_size); + if (!bmap) { + perror("malloc"); + goto err; + } + + sz = (size_t)pread(fd, bmap, bmap_size, header.offset_bmap); + if (sz != bmap_size) { 
+ perror("bmap"); + goto err2; + } + + bc = calloc(1, sizeof(struct vdsk_vdi_ctx)); + if (bc == NULL) { + perror("calloc"); + goto err2; + } + + bc->bc_fd = fd; + bc->delay = (short)delay; + bc->dirty = 0; + bc->blog2 = block_shift; + bc->bmap = bmap; + memcpy(&bc->vdi, &header, sizeof(header)); + bc->super.bc_isgeom = 0; + bc->super.bc_candelete = 0; + bc->super.bc_rdonly = ro; + bc->super.bc_size = (off_t)header.disk_size; + bc->super.bc_sectsz = DEV_BSIZE; + bc->super.bc_psectsz = (int) psectsz; + bc->super.bc_psectoff = 0; + + bc->super.close = disk_close; + bc->super.read = disk_read; + bc->super.write = disk_write; + bc->super.flush = disk_flush; + bc->super.delete = disk_delete; + + free(nopt); + return (struct vdsk *)bc; +err2: + free(bmap); +err: + if (fd >= 0) + close(fd); + free(nopt); + return (NULL); +} diff --git a/src/vdsk/vdsk.c b/src/vdsk/vdsk.c index 9033687..0caf698 100644 --- a/src/vdsk/vdsk.c +++ b/src/vdsk/vdsk.c @@ -36,12 +36,17 @@ #include #include #include +#include struct vdsk * vdsk_open(const char *optstr, int numthr) { int fatal; - /* for now, the one and only backend */ + struct vdsk *vdsk; + vdsk = vdsk_vdi_open(optstr, numthr, &fatal); + if (vdsk || fatal) { + return vdsk; + } return vdsk_raw_open(optstr, numthr, &fatal); }