/*- * Copyright (c) 2018 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include "vmmapi.h" #define HUGETLB_LV1 0 #define HUGETLB_LV2 1 #define HUGETLB_LV_MAX 2 #define MAX_PATH_LEN 128 #define HUGETLBFS_MAGIC 0x958458f6 /* HugePage Level 1 for 2M page, Level 2 for 1G page*/ #define PATH_HUGETLB_LV1 "/run/hugepage/acrn/huge_lv1/" #define OPT_HUGETLB_LV1 "pagesize=2M" #define PATH_HUGETLB_LV2 "/run/hugepage/acrn/huge_lv2/" #define OPT_HUGETLB_LV2 "pagesize=1G" #define SYS_PATH_LV1 "/sys/kernel/mm/hugepages/hugepages-2048kB/" #define SYS_PATH_LV2 "/sys/kernel/mm/hugepages/hugepages-1048576kB/" #define SYS_NR_HUGEPAGES "nr_hugepages" #define SYS_FREE_HUGEPAGES "free_hugepages" /* hugetlb_info record private information for one specific hugetlbfs: * - mounted: is hugetlbfs mounted for below mount_path * - mount_path: hugetlbfs mount path * - mount_opt: hugetlb mount option * - node_path: record for hugetlbfs node path * - pg_size: this hugetlbfs's page size * - lowmem: lowmem of this hugetlbfs need allocate * - highmem: highmem of this hugetlbfs need allocate *.- pages_delta: its value equals needed pages - free pages, *.---if > 0: it's the gap for needed page; if < 0, more free than needed. * - nr_pages_path: sys path for total number of pages *.- free_pages_path: sys path for number of free pages */ struct hugetlb_info { bool mounted; char *mount_path; char *mount_opt; char node_path[MAX_PATH_LEN]; int fd; int pg_size; size_t lowmem; size_t highmem; int pages_delta; char *nr_pages_path; char *free_pages_path; }; static struct hugetlb_info hugetlb_priv[HUGETLB_LV_MAX] = { { .mounted = false, .mount_path = PATH_HUGETLB_LV1, .mount_opt = OPT_HUGETLB_LV1, .fd = -1, .pg_size = 0, .lowmem = 0, .highmem = 0, .pages_delta = 0, .nr_pages_path = SYS_PATH_LV1 SYS_NR_HUGEPAGES, .free_pages_path = SYS_PATH_LV1 SYS_FREE_HUGEPAGES, }, { .mounted = false, .mount_path = PATH_HUGETLB_LV2, .mount_opt = OPT_HUGETLB_LV2, .fd = -1, .pg_size = 0, .lowmem = 0, .highmem = 0, .pages_delta = 0, .nr_pages_path = SYS_PATH_LV2 SYS_NR_HUGEPAGES, .free_pages_path = SYS_PATH_LV2 SYS_FREE_HUGEPAGES, }, }; static void *ptr; static size_t total_size; static int hugetlb_lv_max; static int open_hugetlbfs(struct vmctx *ctx, int level) { char uuid_str[48]; uint8_t UUID[16]; char *path; size_t len; struct statfs fs; if (level >= HUGETLB_LV_MAX) { perror("exceed max hugetlb level"); return -EINVAL; } path = hugetlb_priv[level].node_path; memset(path, '\0', MAX_PATH_LEN); strncpy(path, hugetlb_priv[level].mount_path, MAX_PATH_LEN); len = strnlen(path, MAX_PATH_LEN); /* UUID will use 32 bytes */ if (len + 32 > MAX_PATH_LEN) { perror("PATH overflow"); return -ENOMEM; } uuid_copy(UUID, ctx->vm_uuid); sprintf(uuid_str, "%02X%02X%02X%02X%02X%02X%02X%02X" "%02X%02X%02X%02X%02X%02X%02X%02X\n", UUID[0], UUID[1], UUID[2], UUID[3], UUID[4], UUID[5], UUID[6], UUID[7], UUID[8], UUID[9], UUID[10], UUID[11], UUID[12], UUID[13], UUID[14], UUID[15]); *(path + len) = '\0'; strncat(path, uuid_str, strlen(uuid_str)); printf("open hugetlbfs file %s\n", path); hugetlb_priv[level].fd = open(path, O_CREAT | O_RDWR, 0644); if (hugetlb_priv[level].fd < 0) { perror("Open hugtlbfs failed"); return -EINVAL; } /* get the pagesize */ if (fstatfs(hugetlb_priv[level].fd, &fs) != 0) { perror("Failed to get statfs fo hugetlbfs"); return -EINVAL; } if (fs.f_type == HUGETLBFS_MAGIC) { /* get hugepage size from fstat*/ hugetlb_priv[level].pg_size = fs.f_bsize; } else { close(hugetlb_priv[level].fd); unlink(hugetlb_priv[level].node_path); hugetlb_priv[level].fd = -1; return -EINVAL; } return 0; } static void close_hugetlbfs(int level) { if (level >= HUGETLB_LV_MAX) { perror("exceed max hugetlb level"); return; } if (hugetlb_priv[level].fd >= 0) { close(hugetlb_priv[level].fd); hugetlb_priv[level].fd = -1; unlink(hugetlb_priv[level].node_path); hugetlb_priv[level].pg_size = 0; } } static bool should_enable_hugetlb_level(int level) { if (level >= HUGETLB_LV_MAX) { perror("exceed max hugetlb level"); return false; } return (hugetlb_priv[level].lowmem > 0 || hugetlb_priv[level].highmem > 0); } /* * level : hugepage level * len : region length for mmap * offset : region start offset from ctx->baseaddr * skip : skip offset in different level hugetlbfs fd */ static int mmap_hugetlbfs(struct vmctx *ctx, int level, size_t len, size_t offset, size_t skip) { char *addr; size_t pagesz = 0; int fd, i; if (level >= HUGETLB_LV_MAX) { perror("exceed max hugetlb level"); return -EINVAL; } fd = hugetlb_priv[level].fd; addr = mmap(ctx->baseaddr + offset, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, skip); if (addr == MAP_FAILED) return -ENOMEM; printf("mmap 0x%lx@%p\n", len, addr); /* pre-allocate hugepages by touch them */ pagesz = hugetlb_priv[level].pg_size; printf("touch %ld pages with pagesz 0x%lx\n", len/pagesz, pagesz); for (i = 0; i < len/pagesz; i++) { *(volatile char *)addr = *addr; addr += pagesz; } return 0; } static int mmap_hugetlbfs_lowmem(struct vmctx *ctx) { size_t len, offset, skip; int level, ret = 0, pg_size; offset = skip = 0; for (level = hugetlb_lv_max - 1; level >= HUGETLB_LV1; level--) { len = hugetlb_priv[level].lowmem; pg_size = hugetlb_priv[level].pg_size; while (len > 0) { ret = mmap_hugetlbfs(ctx, level, len, offset, skip); if (ret < 0 && level > HUGETLB_LV1) { len -= pg_size; hugetlb_priv[level].lowmem = len; hugetlb_priv[level-1].lowmem += pg_size; } else if (ret < 0 && level == HUGETLB_LV1) return ret; else { offset += len; break; } } } return 0; } static int mmap_hugetlbfs_highmem(struct vmctx *ctx) { size_t len, offset, skip; int level, ret = 0, pg_size; offset = 4 * GB; for (level = hugetlb_lv_max - 1; level >= HUGETLB_LV1; level--) { skip = hugetlb_priv[level].lowmem; len = hugetlb_priv[level].highmem; pg_size = hugetlb_priv[level].pg_size; while (len > 0) { ret = mmap_hugetlbfs(ctx, level, len, offset, skip); if (ret < 0 && level > HUGETLB_LV1) { len -= pg_size; hugetlb_priv[level].highmem = len; hugetlb_priv[level-1].highmem += pg_size; } else if (ret < 0 && level == HUGETLB_LV1) return ret; else { offset += len; break; } } } return 0; } static int create_hugetlb_dirs(int level) { char tmp_path[MAX_PATH_LEN], *path; int i; size_t len; if (level >= HUGETLB_LV_MAX) { perror("exceed max hugetlb level"); return -EINVAL; } path = hugetlb_priv[level].mount_path; len = strlen(path); if (len >= MAX_PATH_LEN || len == 0) { perror("invalid path len"); return -EINVAL; } memset(tmp_path, '\0', MAX_PATH_LEN); strncpy(tmp_path, path, MAX_PATH_LEN - 1); if ((tmp_path[len - 1] != '/') && (strlen(tmp_path) < MAX_PATH_LEN - 1)) strcat(tmp_path, "/"); len = strlen(tmp_path); for (i = 1; i < len; i++) { if (tmp_path[i] == '/') { tmp_path[i] = 0; if (access(tmp_path, F_OK) != 0) { if (mkdir(tmp_path, 0755) < 0) { perror("mkdir failed"); return -1; } } tmp_path[i] = '/'; } } return 0; } static int mount_hugetlbfs(int level) { int ret; if (level >= HUGETLB_LV_MAX) { perror("exceed max hugetlb level"); return -EINVAL; } if (hugetlb_priv[level].mounted) return 0; /* only support x86 as HUGETLB level-1 2M page, level-2 1G page*/ ret = mount("none", hugetlb_priv[level].mount_path, "hugetlbfs", 0, hugetlb_priv[level].mount_opt); if (ret == 0) hugetlb_priv[level].mounted = true; return ret; } static void umount_hugetlbfs(int level) { if (level >= HUGETLB_LV_MAX) { perror("exceed max hugetlb level"); return; } if (hugetlb_priv[level].mounted) { umount(hugetlb_priv[level].mount_path); hugetlb_priv[level].mounted = false; } } static int read_sys_info(const char *sys_path) { FILE *fp; char tmp_buf[12]; int pages = 0; int result; fp = fopen(sys_path, "r"); if (fp == NULL) { printf("can't open: %s, err: %s\n", sys_path, strerror(errno)); return 0; } memset(tmp_buf, 0, 12); result = fread(&tmp_buf, sizeof(char), 8, fp); if (result <= 0) printf("read %s, error: %s, please check!\n", sys_path, strerror(errno)); else pages = strtol(tmp_buf, NULL, 10); fclose(fp); return pages; } /* check if enough free huge pages for the UOS */ static bool hugetlb_check_memgap(void) { int lvl, free_pages, need_pages; bool has_gap = false; for (lvl = HUGETLB_LV1; lvl < hugetlb_lv_max; lvl++) { free_pages = read_sys_info(hugetlb_priv[lvl].free_pages_path); need_pages = (hugetlb_priv[lvl].lowmem + hugetlb_priv[lvl].highmem) / hugetlb_priv[lvl].pg_size; hugetlb_priv[lvl].pages_delta = need_pages - free_pages; /* if delta > 0, it's a gap for needed pages, to be handled */ if (hugetlb_priv[lvl].pages_delta > 0) has_gap = true; printf("level %d free/need pages:%d/%d page size:0x%x\n", lvl, free_pages, need_pages, hugetlb_priv[lvl].pg_size); } return has_gap; } /* try to reserve more huge pages on the level */ static void reserve_more_pages(int level) { int total_pages, orig_pages, cur_pages; char cmd_buf[MAX_PATH_LEN]; FILE *fp; orig_pages = read_sys_info(hugetlb_priv[level].nr_pages_path); total_pages = orig_pages + hugetlb_priv[level].pages_delta; snprintf(cmd_buf, MAX_PATH_LEN, "echo %d > %s", total_pages, hugetlb_priv[level].nr_pages_path); /* system cmd to reserve needed huge pages */ fp = popen(cmd_buf, "r"); if (fp == NULL) { printf("cmd: %s failed!\n", cmd_buf); return; } pclose(fp); printf("to reserve pages (+orig %d): %s\n", orig_pages, cmd_buf); cur_pages = read_sys_info(hugetlb_priv[level].nr_pages_path); hugetlb_priv[level].pages_delta = total_pages - cur_pages; } /* try to release larger free page */ static bool release_larger_freepage(int level_limit) { int level; int total_pages, orig_pages, cur_pages; char cmd_buf[MAX_PATH_LEN]; FILE *fp; for (level = hugetlb_lv_max - 1; level >= level_limit; level--) { if (hugetlb_priv[level].pages_delta >= 0) continue; /* free one unsed larger page */ orig_pages = read_sys_info(hugetlb_priv[level].nr_pages_path); total_pages = orig_pages - 1; snprintf(cmd_buf, MAX_PATH_LEN, "echo %d > %s", total_pages, hugetlb_priv[level].nr_pages_path); fp = popen(cmd_buf, "r"); if (fp == NULL) { printf("cmd to free mem: %s failed!\n", cmd_buf); return false; } pclose(fp); cur_pages = read_sys_info(hugetlb_priv[level].nr_pages_path); /* release page successfully */ if (cur_pages < orig_pages) { hugetlb_priv[level].pages_delta++; break; } } if (level < level_limit) return false; return true; } /* reserve more free huge pages as different levels. * it need handle following cases: * A.no enough free memory to reserve gap pages, just fails. * B enough free memory to reserve each level gap pages *.C.enough free memory, but it can't reserve enough higher level gap pages, * so lower level need handle that, to reserve more free pages. *.D.enough higher level free pages, but not enough free memory for * lower level gap pages, so release some higher level free pages for that. * other info: *. even enough free memory, it is eaiser to reserve smaller pages than * lager ones, for example:2MB easier than 1GB. One flow of current solution: *.it could leave SOS very small free memory. *.return value: true: success; false: failure */ static bool hugetlb_reserve_pages(void) { int left_gap = 0, pg_size; int level; printf("to reserve more free pages:\n"); for (level = hugetlb_lv_max - 1; level >= HUGETLB_LV1; level--) { if (hugetlb_priv[level].pages_delta <= 0) continue; /* if gaps, try to reserve more pages */ reserve_more_pages(level); /* check if reserved enough pages */ if (hugetlb_priv[level].pages_delta == 0) continue; /* probably system allocates fewer pages than needed * especially for larger page like 1GB, even there is enough * free memory, it stil can fail to allocate 1GB huge page. * so if that,it needs the next level to handle it. */ if (level > HUGETLB_LV1) { left_gap = hugetlb_priv[level].pages_delta; pg_size = hugetlb_priv[level].pg_size; hugetlb_priv[level - 1].pages_delta += (size_t)left_gap * pg_size / hugetlb_priv[level - 1].pg_size; continue; } /* now for level == HUGETLB_LV1 it still can't alloc enough * pages, go back to larger pages, to check if it can *.release some unused free pages, if release success, * let it go LV1 again */ if (release_larger_freepage(level + 1)) level++; else break; } if (level >= HUGETLB_LV1) { printf("level %d pages gap: %d failed to reserve!\n", level, left_gap); return false; } printf("now enough free pages are reserved!\n"); return true; } bool check_hugetlb_support(void) { int level; for (level = HUGETLB_LV1; level < HUGETLB_LV_MAX; level++) { if (create_hugetlb_dirs(level) < 0) return false; } for (level = HUGETLB_LV1; level < HUGETLB_LV_MAX; level++) { if (mount_hugetlbfs(level) < 0) { level--; break; } } if (level < HUGETLB_LV1) /* mount fail for level 1 */ return false; else if (level == HUGETLB_LV1) /* mount fail for level 2 */ printf("WARNING: only level 1 hugetlb supported"); hugetlb_lv_max = level; return true; } int hugetlb_setup_memory(struct vmctx *ctx) { int level; size_t lowmem, highmem; bool has_gap; /* for first time DM start UOS, hugetlbfs is already mounted by * check_hugetlb_support; but for reboot, here need re-mount * it as it already be umount by hugetlb_unsetup_memory * TODO: actually, correct reboot process should not change memory * layout, the setup_memory should be removed from reboot process */ for (level = HUGETLB_LV1; level < hugetlb_lv_max; level++) mount_hugetlbfs(level); /* open hugetlbfs and get pagesize for two level */ for (level = HUGETLB_LV1; level < hugetlb_lv_max; level++) { if (open_hugetlbfs(ctx, level) < 0) { perror("failed to open hugetlbfs"); goto err; } } /* all memory should be at least align with * hugetlb_priv[HUGETLB_LV1].pg_size */ ctx->lowmem = ALIGN_DOWN(ctx->lowmem, hugetlb_priv[HUGETLB_LV1].pg_size); ctx->highmem = ALIGN_DOWN(ctx->highmem, hugetlb_priv[HUGETLB_LV1].pg_size); if (ctx->highmem > 0) total_size = 4 * GB + ctx->highmem; else total_size = ctx->lowmem; if (total_size == 0) { perror("vm request 0 memory"); goto err; } /* check & set hugetlb level memory size for lowmem & highmem */ highmem = ctx->highmem; lowmem = ctx->lowmem; for (level = hugetlb_lv_max - 1; level >= HUGETLB_LV1; level--) { hugetlb_priv[level].lowmem = ALIGN_DOWN(lowmem, hugetlb_priv[level].pg_size); hugetlb_priv[level].highmem = ALIGN_DOWN(highmem, hugetlb_priv[level].pg_size); if (level > HUGETLB_LV1) { hugetlb_priv[level-1].lowmem = lowmem = lowmem - hugetlb_priv[level].lowmem; hugetlb_priv[level-1].highmem = highmem = highmem - hugetlb_priv[level].highmem; } } /* it will check each level memory need */ has_gap = hugetlb_check_memgap(); if (has_gap) { if (!hugetlb_reserve_pages()) goto err; } /* align up total size with huge page size for vma alignment */ for (level = hugetlb_lv_max - 1; level >= HUGETLB_LV1; level--) { if (should_enable_hugetlb_level(level)) { total_size += hugetlb_priv[level].pg_size; break; } } /* dump hugepage trying to setup */ printf("\ntry to setup hugepage with:\n"); for (level = HUGETLB_LV1; level < hugetlb_lv_max; level++) { printf("\tlevel %d - lowmem 0x%lx, highmem 0x%lx\n", level, hugetlb_priv[level].lowmem, hugetlb_priv[level].highmem); } printf("total_size 0x%lx\n\n", total_size); /* basic overview vma */ ptr = mmap(NULL, total_size, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (ptr == MAP_FAILED) { perror("anony mmap fail"); goto err; } /* align up baseaddr according to hugepage level size */ for (level = hugetlb_lv_max - 1; level >= HUGETLB_LV1; level--) { if (should_enable_hugetlb_level(level)) { ctx->baseaddr = (void *)ALIGN_UP((size_t)ptr, hugetlb_priv[level].pg_size); break; } } printf("mmap ptr 0x%p -> baseaddr 0x%p\n", ptr, ctx->baseaddr); /* mmap lowmem */ if (mmap_hugetlbfs_lowmem(ctx) < 0) goto err; /* mmap highmem */ if (mmap_hugetlbfs_highmem(ctx) < 0) goto err; /* dump hugepage really setup */ printf("\nreally setup hugepage with:\n"); for (level = HUGETLB_LV1; level < hugetlb_lv_max; level++) { printf("\tlevel %d - lowmem 0x%lx, highmem 0x%lx\n", level, hugetlb_priv[level].lowmem, hugetlb_priv[level].highmem); } printf("total_size 0x%lx\n\n", total_size); /* map ept for lowmem*/ if (vm_map_memseg_vma(ctx, ctx->lowmem, 0, (uint64_t)ctx->baseaddr, PROT_ALL) < 0) goto err; /* map ept for highmem*/ if (ctx->highmem > 0) { if (vm_map_memseg_vma(ctx, ctx->highmem, 4 * GB, (uint64_t)(ctx->baseaddr + 4 * GB), PROT_ALL) < 0) goto err; } return 0; err: if (ptr) { munmap(ptr, total_size); ptr = NULL; } for (level = HUGETLB_LV1; level < hugetlb_lv_max; level++) { close_hugetlbfs(level); umount_hugetlbfs(level); } return -ENOMEM; } void hugetlb_unsetup_memory(struct vmctx *ctx) { int level; if (total_size > 0) { munmap(ptr, total_size); total_size = 0; ptr = NULL; } for (level = HUGETLB_LV1; level < hugetlb_lv_max; level++) { close_hugetlbfs(level); umount_hugetlbfs(level); } }