diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2d31d811a52d138de9439dff0f58c772b7d0d11d..86aae1fa099a7ba0fcd1fe3f7cb65f9a904057f0 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2573,13 +2573,22 @@ For details see: Documentation/admin-guide/hw-vuln/mds.rst mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory - Amount of memory to be used when the kernel is not able - to see the whole system memory or for test. + Amount of memory to be used in cases as follows: + + 1 for test; + 2 when the kernel is not able to see the whole system memory; + 3 memory that lies after 'mem=' boundary is excluded from + the hypervisor, then assigned to KVM guests. + [X86] Work as limiting max address. Use together with memmap= to avoid physical address space collisions. Without memmap= PCI devices could be placed at addresses belonging to unused RAM. + Note that this only takes effects during boot time since + in above case 3, memory may need be hot added after boot + if system memory of hypervisor is not sufficient. + mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel memory. diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index bd5714547cee0cbb06edd31d59c0d4f12476ad33..2f31de8f7c749ec70a8e6e33fc7b779276410e26 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -310,6 +310,11 @@ thp_fault_fallback is incremented if a page fault fails to allocate a huge page and instead falls back to using small pages. +thp_fault_fallback_charge + is incremented if a page fault fails to charge a huge page and + instead falls back to using small pages even though the + allocation was successful. + thp_collapse_alloc_failed is incremented if khugepaged found a range of pages that should be collapsed into one huge page but failed @@ -319,6 +324,15 @@ thp_file_alloc is incremented every time a file huge page is successfully allocated. +thp_file_fallback + is incremented if a file huge page is attempted to be allocated + but fails and instead falls back to using small pages. + +thp_file_fallback_charge + is incremented if a file huge page cannot be charged and instead + falls back to using small pages even though the allocation was + successful. + thp_file_mapped is incremented every time a file huge page is mapped into user address space. diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 5048cf661a8ae72e9d04a066d574922f92facf22..c30176e67900459e3d3ce4c634606a0642f17850 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -108,6 +108,57 @@ UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an half copied page since it'll keep userfaulting until the copy has finished. +Notes: + +- If you requested UFFDIO_REGISTER_MODE_MISSING when registering then + you must provide some kind of page in your thread after reading from + the uffd. You must provide either UFFDIO_COPY or UFFDIO_ZEROPAGE. + The normal behavior of the OS automatically providing a zero page on + an annonymous mmaping is not in place. + +- None of the page-delivering ioctls default to the range that you + registered with. You must fill in all fields for the appropriate + ioctl struct including the range. + +- You get the address of the access that triggered the missing page + event out of a struct uffd_msg that you read in the thread from the + uffd. You can supply as many pages as you want with UFFDIO_COPY or + UFFDIO_ZEROPAGE. Keep in mind that unless you used DONTWAKE then + the first of any of those IOCTLs wakes up the faulting thread. + +- Be sure to test for all errors including (pollfd[0].revents & + POLLERR). This can happen, e.g. when ranges supplied were + incorrect. + +Write Protect Notifications +--------------------------- + +This is equivalent to (but faster than) using mprotect and a SIGSEGV +signal handler. + +Firstly you need to register a range with UFFDIO_REGISTER_MODE_WP. +Instead of using mprotect(2) you use ioctl(uffd, UFFDIO_WRITEPROTECT, +struct *uffdio_writeprotect) while mode = UFFDIO_WRITEPROTECT_MODE_WP +in the struct passed in. The range does not default to and does not +have to be identical to the range you registered with. You can write +protect as many ranges as you like (inside the registered range). +Then, in the thread reading from uffd the struct will have +msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP set. Now you send +ioctl(uffd, UFFDIO_WRITEPROTECT, struct *uffdio_writeprotect) again +while pagefault.mode does not have UFFDIO_WRITEPROTECT_MODE_WP set. +This wakes up the thread which will continue to run with writes. This +allows you to do the bookkeeping about the write in the uffd reading +thread before the ioctl. + +If you registered with both UFFDIO_REGISTER_MODE_MISSING and +UFFDIO_REGISTER_MODE_WP then you need to think about the sequence in +which you supply a page and undo write protect. Note that there is a +difference between writes into a WP area and into a !WP area. The +former will have UFFD_PAGEFAULT_FLAG_WP set, the latter +UFFD_PAGEFAULT_FLAG_WRITE. The latter did not fail on protection but +you still need to supply a page when UFFDIO_REGISTER_MODE_MISSING was +used. + QEMU/KVM ======== diff --git a/Documentation/vm/free_page_reporting.rst b/Documentation/vm/free_page_reporting.rst new file mode 100644 index 0000000000000000000000000000000000000000..8c05e62d8b2b6ae4da2cd0c193eb21c35161d95a --- /dev/null +++ b/Documentation/vm/free_page_reporting.rst @@ -0,0 +1,40 @@ +.. _free_page_reporting: + +===================== +Free Page Reporting +===================== + +Free page reporting is an API by which a device can register to receive +lists of pages that are currently unused by the system. This is useful in +the case of virtualization where a guest is then able to use this data to +notify the hypervisor that it is no longer using certain pages in memory. + +For the driver, typically a balloon driver, to use of this functionality +it will allocate and initialize a page_reporting_dev_info structure. The +field within the structure it will populate is the "report" function +pointer used to process the scatterlist. It must also guarantee that it can +handle at least PAGE_REPORTING_CAPACITY worth of scatterlist entries per +call to the function. A call to page_reporting_register will register the +page reporting interface with the reporting framework assuming no other +page reporting devices are already registered. + +Once registered the page reporting API will begin reporting batches of +pages to the driver. The API will start reporting pages 2 seconds after +the interface is registered and will continue to do so 2 seconds after any +page of a sufficiently high order is freed. + +Pages reported will be stored in the scatterlist passed to the reporting +function with the final entry having the end bit set in entry nent - 1. +While pages are being processed by the report function they will not be +accessible to the allocator. Once the report function has been completed +the pages will be returned to the free area from which they were obtained. + +Prior to removing a driver that is making use of free page reporting it +is necessary to call page_reporting_unregister to have the +page_reporting_dev_info structure that is currently in use by free page +reporting removed. Doing this will prevent further reports from being +issued via the interface. If another driver or the same driver is +registered it is possible for it to resume where the previous driver had +left off in terms of reporting free pages. + +Alexander Duyck, Dec 04, 2019 diff --git a/Documentation/vm/zswap.rst b/Documentation/vm/zswap.rst index 61f6185188cd207c9fc7b8a29fa25190216ac108..f8c6a79d7c701e043cb4f2798db9c94aaccf9ec5 100644 --- a/Documentation/vm/zswap.rst +++ b/Documentation/vm/zswap.rst @@ -35,9 +35,11 @@ Zswap evicts pages from compressed cache on an LRU basis to the backing swap device when the compressed pool reaches its size limit. This requirement had been identified in prior community discussions. -Zswap is disabled by default but can be enabled at boot time by setting -the ``enabled`` attribute to 1 at boot time. ie: ``zswap.enabled=1``. Zswap -can also be enabled and disabled at runtime using the sysfs interface. +Whether Zswap is enabled at the boot time depends on whether +the ``CONFIG_ZSWAP_DEFAULT_ON`` Kconfig option is enabled or not. +This setting can then be overridden by providing the kernel command line +``zswap.enabled=`` option, for example ``zswap.enabled=0``. +Zswap can also be enabled and disabled at runtime using the sysfs interface. An example command to enable zswap at runtime, assuming sysfs is mounted at ``/sys``, is:: @@ -64,9 +66,10 @@ allocation in zpool is not directly accessible by address. Rather, a handle is returned by the allocation routine and that handle must be mapped before being accessed. The compressed memory pool grows on demand and shrinks as compressed pages are freed. The pool is not preallocated. By default, a zpool -of type zbud is created, but it can be selected at boot time by -setting the ``zpool`` attribute, e.g. ``zswap.zpool=zbud``. It can -also be changed at runtime using the sysfs ``zpool`` attribute, e.g.:: +of type selected in ``CONFIG_ZSWAP_ZPOOL_DEFAULT`` Kconfig option is created, +but it can be overridden at boot time by setting the ``zpool`` attribute, +e.g. ``zswap.zpool=zbud``. It can also be changed at runtime using the sysfs +``zpool`` attribute, e.g.:: echo zbud > /sys/module/zswap/parameters/zpool @@ -97,8 +100,9 @@ controlled policy: * max_pool_percent - The maximum percentage of memory that the compressed pool can occupy. -The default compressor is lzo, but it can be selected at boot time by -setting the ``compressor`` attribute, e.g. ``zswap.compressor=lzo``. +The default compressor is selected in ``CONFIG_ZSWAP_COMPRESSOR_DEFAULT`` +Kconfig option, but it can be overridden at boot time by setting the +``compressor`` attribute, e.g. ``zswap.compressor=lzo``. It can also be changed at runtime using the sysfs "compressor" attribute, e.g.:: diff --git a/MAINTAINERS b/MAINTAINERS index 534a8dc4f84a6e733b0b11da584e4e93a50d3be0..9271068bde639b1cf354cf2bffd1f44c222395e3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -77,21 +77,13 @@ Tips for patch submitters 8. Happy hacking. -Descriptions of section entries -------------------------------- +Descriptions of section entries and preferred order +--------------------------------------------------- M: *Mail* patches to: FullName <address@domain> R: Designated *Reviewer*: FullName <address@domain> These reviewers should be CCed on patches. L: *Mailing list* that is relevant to this area - W: *Web-page* with status/info - B: URI for where to file *bugs*. A web-page with detailed bug - filing info, a direct bug tracker link, or a mailto: URI. - C: URI for *chat* protocol, server and channel where developers - usually hang out, for example irc://server/channel. - Q: *Patchwork* web based patch tracking system site - T: *SCM* tree type and location. - Type is one of: git, hg, quilt, stgit, topgit S: *Status*, one of the following: Supported: Someone is actually paid to look after this. Maintained: Someone actually looks after it. @@ -102,30 +94,39 @@ Descriptions of section entries Obsolete: Old code. Something tagged obsolete generally means it has been replaced by a better system and you should be using that. + W: *Web-page* with status/info + Q: *Patchwork* web based patch tracking system site + B: URI for where to file *bugs*. A web-page with detailed bug + filing info, a direct bug tracker link, or a mailto: URI. + C: URI for *chat* protocol, server and channel where developers + usually hang out, for example irc://server/channel. P: Subsystem Profile document for more details submitting patches to the given subsystem. This is either an in-tree file, or a URI. See Documentation/maintainer/maintainer-entry-profile.rst for details. + T: *SCM* tree type and location. + Type is one of: git, hg, quilt, stgit, topgit F: *Files* and directories wildcard patterns. A trailing slash includes all files and subdirectory files. F: drivers/net/ all files in and below drivers/net F: drivers/net/* all files in drivers/net, but not below F: */net/* all files in "any top level directory"/net One pattern per line. Multiple F: lines acceptable. + X: *Excluded* files and directories that are NOT maintained, same + rules as F:. Files exclusions are tested before file matches. + Can be useful for excluding a specific subdirectory, for instance: + F: net/ + X: net/ipv6/ + matches all files in and below net excluding net/ipv6/ N: Files and directories *Regex* patterns. - N: [^a-z]tegra all files whose path contains the word tegra + N: [^a-z]tegra all files whose path contains tegra + (not including files like integrator) One pattern per line. Multiple N: lines acceptable. scripts/get_maintainer.pl has different behavior for files that match F: pattern and matches of N: patterns. By default, get_maintainer will not look at git log history when an F: pattern match occurs. When an N: match occurs, git log history is used to also notify the people that have git commit signatures. - X: *Excluded* files and directories that are NOT maintained, same - rules as F:. Files exclusions are tested before file matches. - Can be useful for excluding a specific subdirectory, for instance: - F: net/ - X: net/ipv6/ - matches all files in and below net excluding net/ipv6/ K: *Content regex* (perl extended) pattern match in a patch or file. For instance: K: of_get_profile diff --git a/arch/alpha/include/asm/mmzone.h b/arch/alpha/include/asm/mmzone.h index 7ee144f484f1fa90d1ec9d754fbf632a4179d0dd..9b521c857436785345f73fd97660c749baa547b8 100644 --- a/arch/alpha/include/asm/mmzone.h +++ b/arch/alpha/include/asm/mmzone.h @@ -8,8 +8,6 @@ #include <asm/smp.h> -struct bootmem_data_t; /* stupid forward decl. */ - /* * Following are macros that are specific to this numa platform. */ diff --git a/arch/alpha/kernel/syscalls/syscallhdr.sh b/arch/alpha/kernel/syscalls/syscallhdr.sh index e5b99bd2e5e7fe32955b6aa29350d988d9ca6068..1780e861492ac16bcbb4a116f1499f7bacd4050f 100644 --- a/arch/alpha/kernel/syscalls/syscallhdr.sh +++ b/arch/alpha/kernel/syscalls/syscallhdr.sh @@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( printf "#define __NR_syscalls\t%s\n" "${nxt}" printf "#endif\n" printf "\n" - printf "#endif /* %s */" "${fileguard}" + printf "#endif /* %s */\n" "${fileguard}" ) > "$out" diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index d3c61b83e1951121ac88556c5d66aa18336367c4..4e6dc68f325848d093cc5866a84b809f0a3d4a2d 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -141,7 +141,7 @@ good_area: if (!(vma->vm_flags & VM_WRITE)) goto bad_area; } else { - if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) + if (unlikely(!vma_is_accessible(vma))) goto bad_area; } diff --git a/arch/ia64/kernel/syscalls/syscallhdr.sh b/arch/ia64/kernel/syscalls/syscallhdr.sh index 0c2d2c748565a01df6b42f87fcbeee8da2eeeb37..f407b6e53283607bd8997438eaba6062b874d342 100644 --- a/arch/ia64/kernel/syscalls/syscallhdr.sh +++ b/arch/ia64/kernel/syscalls/syscallhdr.sh @@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( printf "#define __NR_syscalls\t%s\n" "${nxt}" printf "#endif\n" printf "\n" - printf "#endif /* %s */" "${fileguard}" + printf "#endif /* %s */\n" "${fileguard}" ) > "$out" diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 1ec6b703c5b4f75a616eddf6cc9c3c5e1566c3b9..6b5652ee76f96f4085955203dfcf0cb2535ce44c 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -54,6 +54,8 @@ SECTIONS { CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT + IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.gnu.linkonce.t*) } diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index f7afb9897966a8d99bff239f7bca3642c958a4bb..3bfb5c8ac3c7a67d3da2cfda79affb4df0f2ce5b 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -125,7 +125,7 @@ good_area: case 1: /* read, present */ goto acc_err; case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + if (unlikely(!vma_is_accessible(vma))) goto acc_err; } diff --git a/arch/microblaze/kernel/syscalls/syscallhdr.sh b/arch/microblaze/kernel/syscalls/syscallhdr.sh index 2e9062a926a3a3d591aa93d232aec8ad378c66b0..a914854f8d9f70f770389089f38bbb52b764f750 100644 --- a/arch/microblaze/kernel/syscalls/syscallhdr.sh +++ b/arch/microblaze/kernel/syscalls/syscallhdr.sh @@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( printf "#define __NR_syscalls\t%s\n" "${nxt}" printf "#endif\n" printf "\n" - printf "#endif /* %s */" "${fileguard}" + printf "#endif /* %s */\n" "${fileguard}" ) > "$out" diff --git a/arch/mips/kernel/syscalls/syscallhdr.sh b/arch/mips/kernel/syscalls/syscallhdr.sh index d2bcfa8f4d1a568061e99a0278ffc9f27dabffe8..2e241e713a7d0f0bd50de17a41b2eebd74fd1136 100644 --- a/arch/mips/kernel/syscalls/syscallhdr.sh +++ b/arch/mips/kernel/syscalls/syscallhdr.sh @@ -32,6 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( printf "#define __NR_syscalls\t%s\n" "${nxt}" printf "#endif\n" printf "\n" - printf "#endif /* %s */" "${fileguard}" - printf "\n" + printf "#endif /* %s */\n" "${fileguard}" ) > "$out" diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 4a0eafe3d932b1effeb5bf968087d704d27cf6e7..f8d62cd83b36ce3d3729388789af30937dfb5c3c 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -142,7 +142,7 @@ good_area: goto bad_area; } } else { - if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) + if (unlikely(!vma_is_accessible(vma))) goto bad_area; } } diff --git a/arch/nds32/kernel/vmlinux.lds.S b/arch/nds32/kernel/vmlinux.lds.S index f679d33974366205b7d39d25de03311127bcac07..7a6c1cefe3fe5cd61043e0dee72197b170f0a1c5 100644 --- a/arch/nds32/kernel/vmlinux.lds.S +++ b/arch/nds32/kernel/vmlinux.lds.S @@ -47,6 +47,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) } diff --git a/arch/parisc/kernel/syscalls/syscallhdr.sh b/arch/parisc/kernel/syscalls/syscallhdr.sh index 50242b747d7ce7c39b63eea2bd423f2a795b20de..730db288fe5417c90e24e51a042daa5bd4290507 100644 --- a/arch/parisc/kernel/syscalls/syscallhdr.sh +++ b/arch/parisc/kernel/syscalls/syscallhdr.sh @@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( printf "#define __NR_syscalls\t%s\n" "${nxt}" printf "#endif\n" printf "\n" - printf "#endif /* %s */" "${fileguard}" + printf "#endif /* %s */\n" "${fileguard}" ) > "$out" diff --git a/arch/powerpc/kernel/syscalls/syscallhdr.sh b/arch/powerpc/kernel/syscalls/syscallhdr.sh index c0a9a32937f11af7b13c08de6768a466d76e3977..02d6751f3be36c66c01e16959e9bbaef72defcf6 100644 --- a/arch/powerpc/kernel/syscalls/syscallhdr.sh +++ b/arch/powerpc/kernel/syscalls/syscallhdr.sh @@ -32,6 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( printf "#define __NR_syscalls\t%s\n" "${nxt}" printf "#endif\n" printf "\n" - printf "#endif /* %s */" "${fileguard}" - printf "\n" + printf "#endif /* %s */\n" "${fileguard}" ) > "$out" diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 425d1380664573db18f35c8d66289cb848025716..df9989cf7ba3f3a537743641ad04da16019833bd 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -422,7 +422,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, break; } } else if (vma && hva >= vma->vm_start && - (vma->vm_flags & VM_HUGETLB)) { + is_vm_hugetlb_page(vma)) { unsigned long psize = vma_kernel_pagesize(vma); tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index d15f0f0ee80630d1e88ee664b2a04b51bab0d24d..84af6c8eecf7113e2aa49d3e5864ac4eab727687 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -314,7 +314,7 @@ static bool access_error(bool is_write, bool is_exec, return false; } - if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + if (unlikely(!vma_is_accessible(vma))) return true; /* * We should ideally do the vma pkey access check here. But in the diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index d6d64f8718e6f3369c51ba27c740934178757cad..13b369d2cc454b7b3995c1ddc191a0e0599ce7cb 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -231,16 +231,10 @@ static int memtrace_online(void) continue; } - /* - * If kernel isn't compiled with the auto online option - * we need to online the memory ourselves. - */ - if (!memhp_auto_online) { - lock_device_hotplug(); - walk_memory_blocks(ent->start, ent->size, NULL, - online_mem_block); - unlock_device_hotplug(); - } + lock_device_hotplug(); + walk_memory_blocks(ent->start, ent->size, NULL, + online_mem_block); + unlock_device_hotplug(); /* * Memory was added successfully so clean up references to it diff --git a/arch/sh/kernel/syscalls/syscallhdr.sh b/arch/sh/kernel/syscalls/syscallhdr.sh index 1de0334e577f6619ea95693cd339df82d3d70560..4c0519861e97084b3b06e70138cf61b326724d0d 100644 --- a/arch/sh/kernel/syscalls/syscallhdr.sh +++ b/arch/sh/kernel/syscalls/syscallhdr.sh @@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( printf "#define __NR_syscalls\t%s\n" "${nxt}" printf "#endif\n" printf "\n" - printf "#endif /* %s */" "${fileguard}" + printf "#endif /* %s */\n" "${fileguard}" ) > "$out" diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 13ee4d20e6226bf4640305df486441650225a32c..5f23d790759766e1a11a6c5e8cacd0f5d0c51952 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -355,7 +355,7 @@ static inline int access_error(int error_code, struct vm_area_struct *vma) return 1; /* read, not present: */ - if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + if (unlikely(!vma_is_accessible(vma))) return 1; return 0; diff --git a/arch/sparc/kernel/syscalls/syscallhdr.sh b/arch/sparc/kernel/syscalls/syscallhdr.sh index 626b5740a9f135f786c21addb1bdd532ce430434..cf50a75cc0bb286e3cedfa25b58baf3009f95a17 100644 --- a/arch/sparc/kernel/syscalls/syscallhdr.sh +++ b/arch/sparc/kernel/syscalls/syscallhdr.sh @@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( printf "#define __NR_syscalls\t%s\n" "${nxt}" printf "#endif\n" printf "\n" - printf "#endif /* %s */" "${fileguard}" + printf "#endif /* %s */\n" "${fileguard}" ) > "$out" diff --git a/arch/sparc/vdso/vdso32/vclock_gettime.c b/arch/sparc/vdso/vdso32/vclock_gettime.c index 026abb3b826cc7bd600268d8609df43d0ce4232d..d7f99e6745ea9a0ef2a15eaa6b7d97762eacdc97 100644 --- a/arch/sparc/vdso/vdso32/vclock_gettime.c +++ b/arch/sparc/vdso/vdso32/vclock_gettime.c @@ -4,10 +4,6 @@ #define BUILD_VDSO32 -#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE -#undef CONFIG_OPTIMIZE_INLINING -#endif - #ifdef CONFIG_SPARC64 /* diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1edf788d301c3ff9f24396f9155ca461de631c5d..8d078642b4bee9d639ec12547e6f08bacdd8ff96 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -149,6 +149,7 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64 + select HAVE_ARCH_USERFAULTFD_WP if USERFAULTFD select HAVE_ARCH_VMAP_STACK if X86_64 select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_ASM_MODVERSIONS diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index ab8b30cb978e33757bef1644aaf079bd42b2e211..550904591e94d85423086102f5a8ded9fa984aa5 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -285,7 +285,6 @@ CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y -CONFIG_OPTIMIZE_INLINING=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 2d196cb49084770c90fd81238cb1501ab53ec3b0..614961009075165e4146e268953e541c9a349f24 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -282,7 +282,6 @@ CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y -CONFIG_OPTIMIZE_INLINING=y CONFIG_UNWINDER_ORC=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c index 1e82bd43286cac91089380fcbb6d823e18cc49bf..84a4a73f77f793c116b0f39f5b729f33487ca86c 100644 --- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -1,10 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #define BUILD_VDSO32 -#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE -#undef CONFIG_OPTIMIZE_INLINING -#endif - #ifdef CONFIG_X86_64 /* diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index afda66a6d3258437393c07c376389016c4590cc2..28838d79019156db24781e33a085cfd2dd693d33 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -25,6 +25,7 @@ #include <asm/x86_init.h> #include <asm/fpu/xstate.h> #include <asm/fpu/api.h> +#include <asm-generic/pgtable_uffd.h> extern pgd_t early_top_pgt[PTRS_PER_PGD]; int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); @@ -313,6 +314,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) return native_make_pte(v & ~clear); } +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline int pte_uffd_wp(pte_t pte) +{ + return pte_flags(pte) & _PAGE_UFFD_WP; +} + +static inline pte_t pte_mkuffd_wp(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_UFFD_WP); +} + +static inline pte_t pte_clear_uffd_wp(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_UFFD_WP); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + static inline pte_t pte_mkclean(pte_t pte) { return pte_clear_flags(pte, _PAGE_DIRTY); @@ -392,6 +410,23 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) return native_make_pmd(v & ~clear); } +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline int pmd_uffd_wp(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_UFFD_WP; +} + +static inline pmd_t pmd_mkuffd_wp(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_UFFD_WP); +} + +static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_UFFD_WP); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + static inline pmd_t pmd_mkold(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_ACCESSED); @@ -1374,6 +1409,38 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) #endif #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline pte_t pte_swp_mkuffd_wp(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SWP_UFFD_WP); +} + +static inline int pte_swp_uffd_wp(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SWP_UFFD_WP; +} + +static inline pte_t pte_swp_clear_uffd_wp(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP); +} + +static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SWP_UFFD_WP); +} + +static inline int pmd_swp_uffd_wp(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SWP_UFFD_WP; +} + +static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_SWP_UFFD_WP); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + #define PKRU_AD_BIT 0x1 #define PKRU_WD_BIT 0x2 #define PKRU_BITS_PER_PKEY 2 diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 0b6c4042942a22247c68e9174622157095f62495..df1373415f11b758184ef917c3417e50f34fd4fe 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -189,7 +189,7 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names - * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry + * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|F|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -197,9 +197,15 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); * erratum where they can be incorrectly set by hardware on * non-present PTEs. * + * SD Bits 1-4 are not used in non-present format and available for + * special use described below: + * * SD (1) in swp entry is used to store soft dirty bit, which helps us * remember soft dirty over page migration * + * F (2) in swp entry is used to record when a pagetable is + * writeprotected by userfaultfd WP support. + * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. * diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 65c2ecd730c5b6db6b7481d20255f9dbeda2dcb3..b6606fe6cfdf363a12f76e3e8aef16e52fabba4f 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -32,6 +32,7 @@ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 +#define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 @@ -100,6 +101,14 @@ #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +#define _PAGE_UFFD_WP (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP) +#define _PAGE_SWP_UFFD_WP _PAGE_USER +#else +#define _PAGE_UFFD_WP (_AT(pteval_t, 0)) +#define _PAGE_SWP_UFFD_WP (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP) @@ -118,7 +127,8 @@ */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC) + _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \ + _PAGE_UFFD_WP) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) /* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 859519f5b3423469ac735972fd26934dbab74cfe..a51df516b87bf1e174e2170b2c642406d2f28633 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1222,7 +1222,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) return 1; /* read, not present: */ - if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + if (unlikely(!vma_is_accessible(vma))) return 1; return 0; diff --git a/arch/xtensa/kernel/syscalls/syscallhdr.sh b/arch/xtensa/kernel/syscalls/syscallhdr.sh index d37db641ca31ab446e9b59180b784566acbf09bc..eebfb8a8ace6349da1cab618b260ee23f38df891 100644 --- a/arch/xtensa/kernel/syscalls/syscallhdr.sh +++ b/arch/xtensa/kernel/syscalls/syscallhdr.sh @@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( printf "#define __NR_syscalls\t%s\n" "${nxt}" printf "#endif\n" printf "\n" - printf "#endif /* %s */" "${fileguard}" + printf "#endif /* %s */\n" "${fileguard}" ) > "$out" diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 4086718f6876ff4f3f1e9c897a14985b9bf21299..dbec3a05590a5163da41494e2b434808688afbe7 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -27,6 +27,24 @@ #define MEMORY_CLASS_NAME "memory" +static const char *const online_type_to_str[] = { + [MMOP_OFFLINE] = "offline", + [MMOP_ONLINE] = "online", + [MMOP_ONLINE_KERNEL] = "online_kernel", + [MMOP_ONLINE_MOVABLE] = "online_movable", +}; + +int memhp_online_type_from_str(const char *str) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) { + if (sysfs_streq(str, online_type_to_str[i])) + return i; + } + return -EINVAL; +} + #define to_memory_block(dev) container_of(dev, struct memory_block, dev) static int sections_per_block; @@ -144,45 +162,6 @@ int memory_notify(unsigned long val, void *v) return blocking_notifier_call_chain(&memory_chain, val, v); } -/* - * The probe routines leave the pages uninitialized, just as the bootmem code - * does. Make sure we do not access them, but instead use only information from - * within sections. - */ -static bool pages_correctly_probed(unsigned long start_pfn) -{ - unsigned long section_nr = pfn_to_section_nr(start_pfn); - unsigned long section_nr_end = section_nr + sections_per_block; - unsigned long pfn = start_pfn; - - /* - * memmap between sections is not contiguous except with - * SPARSEMEM_VMEMMAP. We lookup the page once per section - * and assume memmap is contiguous within each section - */ - for (; section_nr < section_nr_end; section_nr++) { - if (WARN_ON_ONCE(!pfn_valid(pfn))) - return false; - - if (!present_section_nr(section_nr)) { - pr_warn("section %ld pfn[%lx, %lx) not present\n", - section_nr, pfn, pfn + PAGES_PER_SECTION); - return false; - } else if (!valid_section_nr(section_nr)) { - pr_warn("section %ld pfn[%lx, %lx) no valid memmap\n", - section_nr, pfn, pfn + PAGES_PER_SECTION); - return false; - } else if (online_section_nr(section_nr)) { - pr_warn("section %ld pfn[%lx, %lx) is already online\n", - section_nr, pfn, pfn + PAGES_PER_SECTION); - return false; - } - pfn += PAGES_PER_SECTION; - } - - return true; -} - /* * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is * OK to have direct references to sparsemem variables in here. @@ -199,9 +178,6 @@ memory_block_action(unsigned long start_section_nr, unsigned long action, switch (action) { case MEM_ONLINE: - if (!pages_correctly_probed(start_pfn)) - return -EBUSY; - ret = online_pages(start_pfn, nr_pages, online_type, nid); break; case MEM_OFFLINE: @@ -245,17 +221,14 @@ static int memory_subsys_online(struct device *dev) return 0; /* - * If we are called from state_store(), online_type will be - * set >= 0 Otherwise we were called from the device online - * attribute and need to set the online_type. + * When called via device_online() without configuring the online_type, + * we want to default to MMOP_ONLINE. */ - if (mem->online_type < 0) - mem->online_type = MMOP_ONLINE_KEEP; + if (mem->online_type == MMOP_OFFLINE) + mem->online_type = MMOP_ONLINE; ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); - - /* clear online_type */ - mem->online_type = -1; + mem->online_type = MMOP_OFFLINE; return ret; } @@ -267,40 +240,27 @@ static int memory_subsys_offline(struct device *dev) if (mem->state == MEM_OFFLINE) return 0; - /* Can't offline block with non-present sections */ - if (mem->section_count != sections_per_block) - return -EINVAL; - return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); } static ssize_t state_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { + const int online_type = memhp_online_type_from_str(buf); struct memory_block *mem = to_memory_block(dev); - int ret, online_type; + int ret; + + if (online_type < 0) + return -EINVAL; ret = lock_device_hotplug_sysfs(); if (ret) return ret; - if (sysfs_streq(buf, "online_kernel")) - online_type = MMOP_ONLINE_KERNEL; - else if (sysfs_streq(buf, "online_movable")) - online_type = MMOP_ONLINE_MOVABLE; - else if (sysfs_streq(buf, "online")) - online_type = MMOP_ONLINE_KEEP; - else if (sysfs_streq(buf, "offline")) - online_type = MMOP_OFFLINE; - else { - ret = -EINVAL; - goto err; - } - switch (online_type) { case MMOP_ONLINE_KERNEL: case MMOP_ONLINE_MOVABLE: - case MMOP_ONLINE_KEEP: + case MMOP_ONLINE: /* mem->online_type is protected by device_hotplug_lock */ mem->online_type = online_type; ret = device_online(&mem->dev); @@ -312,7 +272,6 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, ret = -EINVAL; /* should never happen */ } -err: unlock_device_hotplug(); if (ret < 0) @@ -380,7 +339,8 @@ static ssize_t valid_zones_show(struct device *dev, } nid = mem->nid; - default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages); + default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn, + nr_pages); strcat(buf, default_zone->name); print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL, @@ -418,23 +378,20 @@ static DEVICE_ATTR_RO(block_size_bytes); static ssize_t auto_online_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (memhp_auto_online) - return sprintf(buf, "online\n"); - else - return sprintf(buf, "offline\n"); + return sprintf(buf, "%s\n", + online_type_to_str[memhp_default_online_type]); } static ssize_t auto_online_blocks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - if (sysfs_streq(buf, "online")) - memhp_auto_online = true; - else if (sysfs_streq(buf, "offline")) - memhp_auto_online = false; - else + const int online_type = memhp_online_type_from_str(buf); + + if (online_type < 0) return -EINVAL; + memhp_default_online_type = online_type; return count; } @@ -627,7 +584,7 @@ static int init_memory_block(struct memory_block **memory, static int add_memory_block(unsigned long base_section_nr) { - int ret, section_count = 0; + int section_count = 0; struct memory_block *mem; unsigned long nr; @@ -638,12 +595,8 @@ static int add_memory_block(unsigned long base_section_nr) if (section_count == 0) return 0; - ret = init_memory_block(&mem, base_memory_block_id(base_section_nr), - MEM_ONLINE); - if (ret) - return ret; - mem->section_count = section_count; - return 0; + return init_memory_block(&mem, base_memory_block_id(base_section_nr), + MEM_ONLINE); } static void unregister_memory(struct memory_block *memory) @@ -679,7 +632,6 @@ int create_memory_block_devices(unsigned long start, unsigned long size) ret = init_memory_block(&mem, block_id, MEM_OFFLINE); if (ret) break; - mem->section_count = sections_per_block; } if (ret) { end_block_id = block_id; @@ -688,7 +640,6 @@ int create_memory_block_devices(unsigned long start, unsigned long size) mem = find_memory_block_by_id(block_id); if (WARN_ON_ONCE(!mem)) continue; - mem->section_count = 0; unregister_memory(mem); } } @@ -717,7 +668,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) mem = find_memory_block_by_id(block_id); if (WARN_ON_ONCE(!mem)) continue; - mem->section_count = 0; unregister_memory_block_under_nodes(mem); unregister_memory(mem); } diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index a02ce43d778da22dd5e6cee60ce38dfd4fedc38b..32e3bc0aa665a1ba6e0c45377d94d3ad9f7e2f30 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -533,7 +533,6 @@ struct hv_dynmem_device { * State to synchronize hot-add. */ struct completion ol_waitevent; - bool ha_waiting; /* * This thread handles hot-add * requests from the host as well as notifying @@ -634,10 +633,7 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, switch (val) { case MEM_ONLINE: case MEM_CANCEL_ONLINE: - if (dm_device.ha_waiting) { - dm_device.ha_waiting = false; - complete(&dm_device.ol_waitevent); - } + complete(&dm_device.ol_waitevent); break; case MEM_OFFLINE: @@ -726,8 +722,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, has->covered_end_pfn += processed_pfn; spin_unlock_irqrestore(&dm_device.ha_lock, flags); - init_completion(&dm_device.ol_waitevent); - dm_device.ha_waiting = !memhp_auto_online; + reinit_completion(&dm_device.ol_waitevent); nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); ret = add_memory(nid, PFN_PHYS((start_pfn)), @@ -753,15 +748,14 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, } /* - * Wait for the memory block to be onlined when memory onlining - * is done outside of kernel (memhp_auto_online). Since the hot - * add has succeeded, it is ok to proceed even if the pages in - * the hot added region have not been "onlined" within the - * allowed time. + * Wait for memory to get onlined. If the kernel onlined the + * memory when adding it, this will return directly. Otherwise, + * it will wait for user space to online the memory. This helps + * to avoid adding memory faster than it is getting onlined. As + * adding succeeded, it is ok to proceed even if the memory was + * not onlined in time. */ - if (dm_device.ha_waiting) - wait_for_completion_timeout(&dm_device.ol_waitevent, - 5*HZ); + wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ); post_status(&dm_device); } } @@ -1706,6 +1700,7 @@ static int balloon_probe(struct hv_device *dev, #ifdef CONFIG_MEMORY_HOTPLUG set_online_page_callback(&hv_online_page); + init_completion(&dm_device.ol_waitevent); register_memory_notifier(&hv_memory_nb); #endif diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c index cc92bc3ed8203761a44dc76fcaa277c05a31bfa6..886459e0ddd98c15ed2c61bc2e96483abc9adda5 100644 --- a/drivers/misc/lkdtm/bugs.c +++ b/drivers/misc/lkdtm/bugs.c @@ -11,6 +11,7 @@ #include <linux/sched/signal.h> #include <linux/sched/task_stack.h> #include <linux/uaccess.h> +#include <linux/slab.h> #ifdef CONFIG_X86_32 #include <asm/desc.h> @@ -175,6 +176,80 @@ void lkdtm_HUNG_TASK(void) schedule(); } +volatile unsigned int huge = INT_MAX - 2; +volatile unsigned int ignored; + +void lkdtm_OVERFLOW_SIGNED(void) +{ + int value; + + value = huge; + pr_info("Normal signed addition ...\n"); + value += 1; + ignored = value; + + pr_info("Overflowing signed addition ...\n"); + value += 4; + ignored = value; +} + + +void lkdtm_OVERFLOW_UNSIGNED(void) +{ + unsigned int value; + + value = huge; + pr_info("Normal unsigned addition ...\n"); + value += 1; + ignored = value; + + pr_info("Overflowing unsigned addition ...\n"); + value += 4; + ignored = value; +} + +/* Intentially using old-style flex array definition of 1 byte. */ +struct array_bounds_flex_array { + int one; + int two; + char data[1]; +}; + +struct array_bounds { + int one; + int two; + char data[8]; + int three; +}; + +void lkdtm_ARRAY_BOUNDS(void) +{ + struct array_bounds_flex_array *not_checked; + struct array_bounds *checked; + volatile int i; + + not_checked = kmalloc(sizeof(*not_checked) * 2, GFP_KERNEL); + checked = kmalloc(sizeof(*checked) * 2, GFP_KERNEL); + + pr_info("Array access within bounds ...\n"); + /* For both, touch all bytes in the actual member size. */ + for (i = 0; i < sizeof(checked->data); i++) + checked->data[i] = 'A'; + /* + * For the uninstrumented flex array member, also touch 1 byte + * beyond to verify it is correctly uninstrumented. + */ + for (i = 0; i < sizeof(not_checked->data) + 1; i++) + not_checked->data[i] = 'A'; + + pr_info("Array access beyond bounds ...\n"); + for (i = 0; i < sizeof(checked->data) + 1; i++) + checked->data[i] = 'B'; + + kfree(not_checked); + kfree(checked); +} + void lkdtm_CORRUPT_LIST_ADD(void) { /* diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c index 5ce4ac8c06fc049204a0baa4cf59921c6ae258ec..a5e344df91663294e4e205c0a04e898956fb697f 100644 --- a/drivers/misc/lkdtm/core.c +++ b/drivers/misc/lkdtm/core.c @@ -130,6 +130,9 @@ static const struct crashtype crashtypes[] = { CRASHTYPE(HARDLOCKUP), CRASHTYPE(SPINLOCKUP), CRASHTYPE(HUNG_TASK), + CRASHTYPE(OVERFLOW_SIGNED), + CRASHTYPE(OVERFLOW_UNSIGNED), + CRASHTYPE(ARRAY_BOUNDS), CRASHTYPE(EXEC_DATA), CRASHTYPE(EXEC_STACK), CRASHTYPE(EXEC_KMALLOC), diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h index 8d13d01766247f1f56bb953445ac75746b5aa0bd..601a2156a0d48ccf417421282487ba9b36b1070b 100644 --- a/drivers/misc/lkdtm/lkdtm.h +++ b/drivers/misc/lkdtm/lkdtm.h @@ -22,6 +22,9 @@ void lkdtm_SOFTLOCKUP(void); void lkdtm_HARDLOCKUP(void); void lkdtm_SPINLOCKUP(void); void lkdtm_HUNG_TASK(void); +void lkdtm_OVERFLOW_SIGNED(void); +void lkdtm_OVERFLOW_UNSIGNED(void); +void lkdtm_ARRAY_BOUNDS(void); void lkdtm_CORRUPT_LIST_ADD(void); void lkdtm_CORRUPT_LIST_DEL(void); void lkdtm_CORRUPT_USER_DS(void); diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 078615cf2afcd5f818a5cd26af5137e38f8b6041..4b2dd8259ff515e5a4a6c3d5c59271a9f92062e0 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -58,6 +58,7 @@ config VIRTIO_BALLOON tristate "Virtio balloon driver" depends on VIRTIO select MEMORY_BALLOON + select PAGE_REPORTING ---help--- This driver supports increasing and decreasing the amount of memory within a KVM guest. diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 341458fd95ca4896e9eb4bd708473dc1df1ff012..0ef16566c3f3390936a3e3d97e99a1fb97f40193 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -14,11 +14,13 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/balloon_compaction.h> +#include <linux/oom.h> #include <linux/wait.h> #include <linux/mm.h> #include <linux/mount.h> #include <linux/magic.h> #include <linux/pseudo_fs.h> +#include <linux/page_reporting.h> /* * Balloon device works in 4K page units. So each page is pointed to by @@ -27,7 +29,9 @@ */ #define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT) #define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256 -#define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80 +/* Maximum number of (4k) pages to deflate on OOM notifications. */ +#define VIRTIO_BALLOON_OOM_NR_PAGES 256 +#define VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY 80 #define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \ __GFP_NOMEMALLOC) @@ -47,6 +51,7 @@ enum virtio_balloon_vq { VIRTIO_BALLOON_VQ_DEFLATE, VIRTIO_BALLOON_VQ_STATS, VIRTIO_BALLOON_VQ_FREE_PAGE, + VIRTIO_BALLOON_VQ_REPORTING, VIRTIO_BALLOON_VQ_MAX }; @@ -112,8 +117,15 @@ struct virtio_balloon { /* Memory statistics */ struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; - /* To register a shrinker to shrink memory upon memory pressure */ + /* Shrinker to return free pages - VIRTIO_BALLOON_F_FREE_PAGE_HINT */ struct shrinker shrinker; + + /* OOM notifier to deflate on OOM - VIRTIO_BALLOON_F_DEFLATE_ON_OOM */ + struct notifier_block oom_nb; + + /* Free page reporting device */ + struct virtqueue *reporting_vq; + struct page_reporting_dev_info pr_dev_info; }; static struct virtio_device_id id_table[] = { @@ -153,6 +165,33 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) } +int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info, + struct scatterlist *sg, unsigned int nents) +{ + struct virtio_balloon *vb = + container_of(pr_dev_info, struct virtio_balloon, pr_dev_info); + struct virtqueue *vq = vb->reporting_vq; + unsigned int unused, err; + + /* We should always be able to add these buffers to an empty queue. */ + err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN); + + /* + * In the extremely unlikely case that something has occurred and we + * are able to trigger an error we will simply display a warning + * and exit without actually processing the pages. + */ + if (WARN_ON_ONCE(err)) + return err; + + virtqueue_kick(vq); + + /* When host has read buffer, this completes via balloon_ack */ + wait_event(vb->acked, virtqueue_get_buf(vq, &unused)); + + return 0; +} + static void set_page_pfns(struct virtio_balloon *vb, __virtio32 pfns[], struct page *page) { @@ -481,6 +520,7 @@ static int init_vqs(struct virtio_balloon *vb) names[VIRTIO_BALLOON_VQ_STATS] = NULL; callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; + names[VIRTIO_BALLOON_VQ_REPORTING] = NULL; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { names[VIRTIO_BALLOON_VQ_STATS] = "stats"; @@ -492,6 +532,11 @@ static int init_vqs(struct virtio_balloon *vb) callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; } + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { + names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq"; + callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack; + } + err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, vqs, callbacks, names, NULL, NULL); if (err) @@ -524,6 +569,9 @@ static int init_vqs(struct virtio_balloon *vb) if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE]; + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) + vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING]; + return 0; } @@ -788,50 +836,13 @@ static unsigned long shrink_free_pages(struct virtio_balloon *vb, return blocks_freed * VIRTIO_BALLOON_HINT_BLOCK_PAGES; } -static unsigned long leak_balloon_pages(struct virtio_balloon *vb, - unsigned long pages_to_free) -{ - return leak_balloon(vb, pages_to_free * VIRTIO_BALLOON_PAGES_PER_PAGE) / - VIRTIO_BALLOON_PAGES_PER_PAGE; -} - -static unsigned long shrink_balloon_pages(struct virtio_balloon *vb, - unsigned long pages_to_free) -{ - unsigned long pages_freed = 0; - - /* - * One invocation of leak_balloon can deflate at most - * VIRTIO_BALLOON_ARRAY_PFNS_MAX balloon pages, so we call it - * multiple times to deflate pages till reaching pages_to_free. - */ - while (vb->num_pages && pages_freed < pages_to_free) - pages_freed += leak_balloon_pages(vb, - pages_to_free - pages_freed); - - update_balloon_size(vb); - - return pages_freed; -} - static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - unsigned long pages_to_free, pages_freed = 0; struct virtio_balloon *vb = container_of(shrinker, struct virtio_balloon, shrinker); - pages_to_free = sc->nr_to_scan; - - if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) - pages_freed = shrink_free_pages(vb, pages_to_free); - - if (pages_freed >= pages_to_free) - return pages_freed; - - pages_freed += shrink_balloon_pages(vb, pages_to_free - pages_freed); - - return pages_freed; + return shrink_free_pages(vb, sc->nr_to_scan); } static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker, @@ -839,12 +850,22 @@ static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker, { struct virtio_balloon *vb = container_of(shrinker, struct virtio_balloon, shrinker); - unsigned long count; - count = vb->num_pages / VIRTIO_BALLOON_PAGES_PER_PAGE; - count += vb->num_free_page_blocks * VIRTIO_BALLOON_HINT_BLOCK_PAGES; + return vb->num_free_page_blocks * VIRTIO_BALLOON_HINT_BLOCK_PAGES; +} + +static int virtio_balloon_oom_notify(struct notifier_block *nb, + unsigned long dummy, void *parm) +{ + struct virtio_balloon *vb = container_of(nb, + struct virtio_balloon, oom_nb); + unsigned long *freed = parm; + + *freed += leak_balloon(vb, VIRTIO_BALLOON_OOM_NR_PAGES) / + VIRTIO_BALLOON_PAGES_PER_PAGE; + update_balloon_size(vb); - return count; + return NOTIFY_OK; } static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb) @@ -864,7 +885,6 @@ static int virtio_balloon_register_shrinker(struct virtio_balloon *vb) static int virtballoon_probe(struct virtio_device *vdev) { struct virtio_balloon *vb; - __u32 poison_val; int err; if (!vdev->config->get) { @@ -930,27 +950,65 @@ static int virtballoon_probe(struct virtio_device *vdev) VIRTIO_BALLOON_CMD_ID_STOP); spin_lock_init(&vb->free_page_list_lock); INIT_LIST_HEAD(&vb->free_page_list); - if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) { - memset(&poison_val, PAGE_POISON, sizeof(poison_val)); - virtio_cwrite(vb->vdev, struct virtio_balloon_config, - poison_val, &poison_val); - } - } - /* - * We continue to use VIRTIO_BALLOON_F_DEFLATE_ON_OOM to decide if a - * shrinker needs to be registered to relieve memory pressure. - */ - if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) { + /* + * We're allowed to reuse any free pages, even if they are + * still to be processed by the host. + */ err = virtio_balloon_register_shrinker(vb); if (err) goto out_del_balloon_wq; } + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) { + vb->oom_nb.notifier_call = virtio_balloon_oom_notify; + vb->oom_nb.priority = VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY; + err = register_oom_notifier(&vb->oom_nb); + if (err < 0) + goto out_unregister_shrinker; + } + + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) { + /* Start with poison val of 0 representing general init */ + __u32 poison_val = 0; + + /* + * Let the hypervisor know that we are expecting a + * specific value to be written back in balloon pages. + */ + if (!want_init_on_free()) + memset(&poison_val, PAGE_POISON, sizeof(poison_val)); + + virtio_cwrite(vb->vdev, struct virtio_balloon_config, + poison_val, &poison_val); + } + + vb->pr_dev_info.report = virtballoon_free_page_report; + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { + unsigned int capacity; + + capacity = virtqueue_get_vring_size(vb->reporting_vq); + if (capacity < PAGE_REPORTING_CAPACITY) { + err = -ENOSPC; + goto out_unregister_oom; + } + + err = page_reporting_register(&vb->pr_dev_info); + if (err) + goto out_unregister_oom; + } + virtio_device_ready(vdev); if (towards_target(vb)) virtballoon_changed(vdev); return 0; +out_unregister_oom: + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) + unregister_oom_notifier(&vb->oom_nb); +out_unregister_shrinker: + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + virtio_balloon_unregister_shrinker(vb); out_del_balloon_wq: if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) destroy_workqueue(vb->balloon_wq); @@ -989,7 +1047,11 @@ static void virtballoon_remove(struct virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) + page_reporting_unregister(&vb->pr_dev_info); if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) + unregister_oom_notifier(&vb->oom_nb); + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) virtio_balloon_unregister_shrinker(vb); spin_lock_irq(&vb->stop_update_lock); vb->stop_update = true; @@ -1045,7 +1107,10 @@ static int virtballoon_restore(struct virtio_device *vdev) static int virtballoon_validate(struct virtio_device *vdev) { - if (!page_poisoning_enabled()) + /* Tell the host whether we care about poisoned pages. */ + if (!want_init_on_free() && + (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY) || + !page_poisoning_enabled())) __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON); __virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM); @@ -1058,6 +1123,7 @@ static unsigned int features[] = { VIRTIO_BALLOON_F_DEFLATE_ON_OOM, VIRTIO_BALLOON_F_FREE_PAGE_HINT, VIRTIO_BALLOON_F_PAGE_POISON, + VIRTIO_BALLOON_F_REPORTING, }; static struct virtio_driver virtio_balloon_driver = { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f4713ea76e8274a5637d730274c0cb06dd7c96fa..13f25e241ac46cbd2f5ffa23de45e60a035a0c1a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -27,6 +27,7 @@ #include <linux/highuid.h> #include <linux/compiler.h> #include <linux/highmem.h> +#include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/vmalloc.h> #include <linux/security.h> @@ -698,19 +699,11 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf; - struct { - struct elfhdr interp_elf_ex; - } *loc; + struct elfhdr *interp_elf_ex = NULL; struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; struct mm_struct *mm; struct pt_regs *regs; - loc = kmalloc(sizeof(*loc), GFP_KERNEL); - if (!loc) { - retval = -ENOMEM; - goto out_ret; - } - retval = -ENOEXEC; /* First of all, some simple consistency checks */ if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0) @@ -770,9 +763,15 @@ static int load_elf_binary(struct linux_binprm *bprm) */ would_dump(bprm, interpreter); + interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL); + if (!interp_elf_ex) { + retval = -ENOMEM; + goto out_free_ph; + } + /* Get the exec headers */ - retval = elf_read(interpreter, &loc->interp_elf_ex, - sizeof(loc->interp_elf_ex), 0); + retval = elf_read(interpreter, interp_elf_ex, + sizeof(*interp_elf_ex), 0); if (retval < 0) goto out_free_dentry; @@ -806,25 +805,25 @@ out_free_interp: if (interpreter) { retval = -ELIBBAD; /* Not an ELF interpreter */ - if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + if (memcmp(interp_elf_ex->e_ident, ELFMAG, SELFMAG) != 0) goto out_free_dentry; /* Verify the interpreter has a valid arch */ - if (!elf_check_arch(&loc->interp_elf_ex) || - elf_check_fdpic(&loc->interp_elf_ex)) + if (!elf_check_arch(interp_elf_ex) || + elf_check_fdpic(interp_elf_ex)) goto out_free_dentry; /* Load the interpreter program headers */ - interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex, + interp_elf_phdata = load_elf_phdrs(interp_elf_ex, interpreter); if (!interp_elf_phdata) goto out_free_dentry; /* Pass PT_LOPROC..PT_HIPROC headers to arch code */ elf_ppnt = interp_elf_phdata; - for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++) + for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { case PT_LOPROC ... PT_HIPROC: - retval = arch_elf_pt_proc(&loc->interp_elf_ex, + retval = arch_elf_pt_proc(interp_elf_ex, elf_ppnt, interpreter, true, &arch_state); if (retval) @@ -839,7 +838,7 @@ out_free_interp: * the exec syscall. */ retval = arch_check_elf(elf_ex, - !!interpreter, &loc->interp_elf_ex, + !!interpreter, interp_elf_ex, &arch_state); if (retval) goto out_free_dentry; @@ -1055,7 +1054,7 @@ out_free_interp: } if (interpreter) { - elf_entry = load_elf_interp(&loc->interp_elf_ex, + elf_entry = load_elf_interp(interp_elf_ex, interpreter, load_bias, interp_elf_phdata); if (!IS_ERR((void *)elf_entry)) { @@ -1064,7 +1063,7 @@ out_free_interp: * adjustment */ interp_load_addr = elf_entry; - elf_entry += loc->interp_elf_ex.e_entry; + elf_entry += interp_elf_ex->e_entry; } if (BAD_ADDR(elf_entry)) { retval = IS_ERR((void *)elf_entry) ? @@ -1075,6 +1074,9 @@ out_free_interp: allow_write_access(interpreter); fput(interpreter); + + kfree(interp_elf_ex); + kfree(interp_elf_phdata); } else { elf_entry = e_entry; if (BAD_ADDR(elf_entry)) { @@ -1083,7 +1085,6 @@ out_free_interp: } } - kfree(interp_elf_phdata); kfree(elf_phdata); set_binfmt(&elf_format); @@ -1153,12 +1154,11 @@ out_free_interp: start_thread(regs, elf_entry, bprm->p); retval = 0; out: - kfree(loc); -out_ret: return retval; /* error cleanup */ out_free_dentry: + kfree(interp_elf_ex); kfree(interp_elf_phdata); allow_write_access(interpreter); if (interpreter) @@ -1317,7 +1317,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, } /* Hugetlb memory check */ - if (vma->vm_flags & VM_HUGETLB) { + if (is_vm_hugetlb_page(vma)) { if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) goto whole; if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index eee3c92a9ebf85d4d6509018ffa03ed211a3bdc4..8c596641a72b09733920dd12643194e2bd4b3ab1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -218,13 +218,18 @@ struct eventpoll { struct file *file; /* used to optimize loop detection check */ - int visited; struct list_head visited_list_link; + int visited; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; #endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* tracks wakeup nests for lockdep validation */ + u8 nests; +#endif }; /* Wait structure used by the poll hooks */ @@ -545,30 +550,47 @@ out_unlock: */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static DEFINE_PER_CPU(int, wakeup_nest); - -static void ep_poll_safewake(wait_queue_head_t *wq) +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) { + struct eventpoll *ep_src; unsigned long flags; - int subclass; + u8 nests = 0; - local_irq_save(flags); - preempt_disable(); - subclass = __this_cpu_read(wakeup_nest); - spin_lock_nested(&wq->lock, subclass + 1); - __this_cpu_inc(wakeup_nest); - wake_up_locked_poll(wq, POLLIN); - __this_cpu_dec(wakeup_nest); - spin_unlock(&wq->lock); - local_irq_restore(flags); - preempt_enable(); + /* + * To set the subclass or nesting level for spin_lock_irqsave_nested() + * it might be natural to create a per-cpu nest count. However, since + * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can + * schedule() in the -rt kernel, the per-cpu variable are no longer + * protected. Thus, we are introducing a per eventpoll nest field. + * If we are not being call from ep_poll_callback(), epi is NULL and + * we are at the first level of nesting, 0. Otherwise, we are being + * called from ep_poll_callback() and if a previous wakeup source is + * not an epoll file itself, we are at depth 1 since the wakeup source + * is depth 0. If the wakeup source is a previous epoll file in the + * wakeup chain then we use its nests value and record ours as + * nests + 1. The previous epoll file nests value is stable since its + * already holding its own poll_wait.lock. + */ + if (epi) { + if ((is_file_epoll(epi->ffd.file))) { + ep_src = epi->ffd.file->private_data; + nests = ep_src->nests; + } else { + nests = 1; + } + } + spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); + ep->nests = nests + 1; + wake_up_locked_poll(&ep->poll_wait, EPOLLIN); + ep->nests = 0; + spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else -static void ep_poll_safewake(wait_queue_head_t *wq) +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) { - wake_up_poll(wq, EPOLLIN); + wake_up_poll(&ep->poll_wait, EPOLLIN); } #endif @@ -789,7 +811,7 @@ static void ep_free(struct eventpoll *ep) /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, NULL); /* * We need to lock this because we could be hit by @@ -1258,7 +1280,7 @@ out_unlock: /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, epi); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; @@ -1562,7 +1584,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, NULL); return 0; @@ -1666,7 +1688,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, NULL); return 0; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 5efaf3708ec69e5ff41525c8faae06773c231e31..8e16f14bb05af7eb9b0e0f31eaa8165b622560d9 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -635,28 +635,35 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0; struct mm_struct *mm = get_task_mm(task); if (mm) { + unsigned long size; + unsigned long resident = 0; + unsigned long shared = 0; + unsigned long text = 0; + unsigned long data = 0; + size = task_statm(mm, &shared, &text, &data, &resident); mmput(mm); - } - /* - * For quick read, open code by putting numbers directly - * expected format is - * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", - * size, resident, shared, text, data); - */ - seq_put_decimal_ull(m, "", size); - seq_put_decimal_ull(m, " ", resident); - seq_put_decimal_ull(m, " ", shared); - seq_put_decimal_ull(m, " ", text); - seq_put_decimal_ull(m, " ", 0); - seq_put_decimal_ull(m, " ", data); - seq_put_decimal_ull(m, " ", 0); - seq_putc(m, '\n'); + /* + * For quick read, open code by putting numbers directly + * expected format is + * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", + * size, resident, shared, text, data); + */ + seq_put_decimal_ull(m, "", size); + seq_put_decimal_ull(m, " ", resident); + seq_put_decimal_ull(m, " ", shared); + seq_put_decimal_ull(m, " ", text); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ull(m, " ", data); + seq_put_decimal_ull(m, " ", 0); + seq_putc(m, '\n'); + } else { + seq_write(m, "0 0 0 0 0 0 0\n", 14); + } return 0; } diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index c1dea9b8222e1ab75bb61412b7b0cecbc7390a37..d0989a443c77dfe2b99207697a391bef17032c92 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -17,6 +17,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file) } static const struct proc_ops cpuinfo_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = cpuinfo_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 3faed94e4b6596f6774d2e8ec258f4454e169209..4ed6dabdf6ffeb90778d685e687735e6f81c4ea7 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -531,6 +531,12 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, return p; } +static inline void pde_set_flags(struct proc_dir_entry *pde) +{ + if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) + pde->flags |= PROC_ENTRY_PERMANENT; +} + struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops, void *data) @@ -541,6 +547,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, if (!p) return NULL; p->proc_ops = proc_ops; + pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); @@ -572,6 +579,7 @@ static int proc_seq_release(struct inode *inode, struct file *file) } static const struct proc_ops proc_seq_ops = { + /* not permanent -- can call into arbitrary seq_operations */ .proc_open = proc_seq_open, .proc_read = seq_read, .proc_lseek = seq_lseek, @@ -602,6 +610,7 @@ static int proc_single_open(struct inode *inode, struct file *file) } static const struct proc_ops proc_single_ops = { + /* not permanent -- can call into arbitrary ->single_show */ .proc_open = proc_single_open, .proc_read = seq_read, .proc_lseek = seq_lseek, @@ -662,9 +671,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) { - rb_erase(&de->subdir_node, &parent->subdir); - if (S_ISDIR(de->mode)) { - parent->nlink--; + if (unlikely(pde_is_permanent(de))) { + WARN(1, "removing permanent /proc entry '%s'", de->name); + de = NULL; + } else { + rb_erase(&de->subdir_node, &parent->subdir); + if (S_ISDIR(de->mode)) + parent->nlink--; } } write_unlock(&proc_subdir_lock); @@ -700,12 +713,24 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) write_unlock(&proc_subdir_lock); return -ENOENT; } + if (unlikely(pde_is_permanent(root))) { + write_unlock(&proc_subdir_lock); + WARN(1, "removing permanent /proc entry '%s/%s'", + root->parent->name, root->name); + return -EINVAL; + } rb_erase(&root->subdir_node, &parent->subdir); de = root; while (1) { next = pde_subdir_first(de); if (next) { + if (unlikely(pde_is_permanent(root))) { + write_unlock(&proc_subdir_lock); + WARN(1, "removing permanent /proc entry '%s/%s'", + next->parent->name, next->name); + return -EINVAL; + } rb_erase(&next->subdir_node, &de->subdir); de = next; continue; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 1e730ea1dcd646b9168a67df99e4ff4bc978cfd8..fb4cace9ea412ecaf31d225b677f1be4e52aaca5 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -202,6 +202,7 @@ static void unuse_pde(struct proc_dir_entry *pde) /* pde is locked on entry, unlocked on exit */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) + __releases(&pde->pde_unload_lock) { /* * close() (proc_reg_release()) can't delete an entry and proceed: @@ -258,135 +259,204 @@ void proc_entry_rundown(struct proc_dir_entry *de) spin_unlock(&de->pde_unload_lock); } +static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence) +{ + typeof_member(struct proc_ops, proc_lseek) lseek; + + lseek = pde->proc_ops->proc_lseek; + if (!lseek) + lseek = default_llseek; + return lseek(file, offset, whence); +} + static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_lseek) lseek; - lseek = pde->proc_ops->proc_lseek; - if (!lseek) - lseek = default_llseek; - rv = lseek(file, offset, whence); + if (pde_is_permanent(pde)) { + return pde_lseek(pde, file, offset, whence); + } else if (use_pde(pde)) { + rv = pde_lseek(pde, file, offset, whence); unuse_pde(pde); } return rv; } +static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + typeof_member(struct proc_ops, proc_read) read; + + read = pde->proc_ops->proc_read; + if (read) + return read(file, buf, count, ppos); + return -EIO; +} + static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_read) read; - read = pde->proc_ops->proc_read; - if (read) - rv = read(file, buf, count, ppos); + if (pde_is_permanent(pde)) { + return pde_read(pde, file, buf, count, ppos); + } else if (use_pde(pde)) { + rv = pde_read(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } +static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + typeof_member(struct proc_ops, proc_write) write; + + write = pde->proc_ops->proc_write; + if (write) + return write(file, buf, count, ppos); + return -EIO; +} + static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_write) write; - write = pde->proc_ops->proc_write; - if (write) - rv = write(file, buf, count, ppos); + if (pde_is_permanent(pde)) { + return pde_write(pde, file, buf, count, ppos); + } else if (use_pde(pde)) { + rv = pde_write(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } +static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) +{ + typeof_member(struct proc_ops, proc_poll) poll; + + poll = pde->proc_ops->proc_poll; + if (poll) + return poll(file, pts); + return DEFAULT_POLLMASK; +} + static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_poll) poll; - poll = pde->proc_ops->proc_poll; - if (poll) - rv = poll(file, pts); + if (pde_is_permanent(pde)) { + return pde_poll(pde, file, pts); + } else if (use_pde(pde)) { + rv = pde_poll(pde, file, pts); unuse_pde(pde); } return rv; } +static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) +{ + typeof_member(struct proc_ops, proc_ioctl) ioctl; + + ioctl = pde->proc_ops->proc_ioctl; + if (ioctl) + return ioctl(file, cmd, arg); + return -ENOTTY; +} + static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_ioctl) ioctl; - ioctl = pde->proc_ops->proc_ioctl; - if (ioctl) - rv = ioctl(file, cmd, arg); + if (pde_is_permanent(pde)) { + return pde_ioctl(pde, file, cmd, arg); + } else if (use_pde(pde)) { + rv = pde_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #ifdef CONFIG_COMPAT +static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) +{ + typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; + + compat_ioctl = pde->proc_ops->proc_compat_ioctl; + if (compat_ioctl) + return compat_ioctl(file, cmd, arg); + return -ENOTTY; +} + static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; - - compat_ioctl = pde->proc_ops->proc_compat_ioctl; - if (compat_ioctl) - rv = compat_ioctl(file, cmd, arg); + if (pde_is_permanent(pde)) { + return pde_compat_ioctl(pde, file, cmd, arg); + } else if (use_pde(pde)) { + rv = pde_compat_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #endif +static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) +{ + typeof_member(struct proc_ops, proc_mmap) mmap; + + mmap = pde->proc_ops->proc_mmap; + if (mmap) + return mmap(file, vma); + return -EIO; +} + static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_mmap) mmap; - mmap = pde->proc_ops->proc_mmap; - if (mmap) - rv = mmap(file, vma); + if (pde_is_permanent(pde)) { + return pde_mmap(pde, file, vma); + } else if (use_pde(pde)) { + rv = pde_mmap(pde, file, vma); unuse_pde(pde); } return rv; } static unsigned long -proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, +pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - struct proc_dir_entry *pde = PDE(file_inode(file)); - unsigned long rv = -EIO; - - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; + typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; - get_area = pde->proc_ops->proc_get_unmapped_area; + get_area = pde->proc_ops->proc_get_unmapped_area; #ifdef CONFIG_MMU - if (!get_area) - get_area = current->mm->get_unmapped_area; + if (!get_area) + get_area = current->mm->get_unmapped_area; #endif + if (get_area) + return get_area(file, orig_addr, len, pgoff, flags); + return orig_addr; +} + +static unsigned long +proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + unsigned long rv = -EIO; - if (get_area) - rv = get_area(file, orig_addr, len, pgoff, flags); - else - rv = orig_addr; + if (pde_is_permanent(pde)) { + return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); + } else if (use_pde(pde)) { + rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); unuse_pde(pde); } return rv; @@ -400,6 +470,13 @@ static int proc_reg_open(struct inode *inode, struct file *file) typeof_member(struct proc_ops, proc_release) release; struct pde_opener *pdeo; + if (pde_is_permanent(pde)) { + open = pde->proc_ops->proc_open; + if (open) + rv = open(inode, file); + return rv; + } + /* * Ensure that * 1) PDE's ->release hook will be called no matter what @@ -449,6 +526,17 @@ static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); struct pde_opener *pdeo; + + if (pde_is_permanent(pde)) { + typeof_member(struct proc_ops, proc_release) release; + + release = pde->proc_ops->proc_release; + if (release) { + return release(inode, file); + } + return 0; + } + spin_lock(&pde->pde_unload_lock); list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9e294f0290e5ab7de37a92e85632d94b26c1dcad..917cc85e346630c11190085ce8ff191439dfb7dd 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -61,6 +61,7 @@ struct proc_dir_entry { struct rb_node subdir_node; char *name; umode_t mode; + u8 flags; u8 namelen; char inline_name[]; } __randomize_layout; @@ -73,6 +74,11 @@ struct proc_dir_entry { 0) #define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) +static inline bool pde_is_permanent(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_PERMANENT; +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index ec1b7d2fb77356cfd128924c71590a3e53ff9d16..b38ad552887fb5e692787ace19861855b1e4ed0c 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -50,6 +50,7 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait) static const struct proc_ops kmsg_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_read = kmsg_read, .proc_poll = kmsg_poll, .proc_open = kmsg_open, diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 0449edf460f5b50c1cb6f203dbd6887908e9248e..46b3293015fe61f78dd14781aaabc4b9491da1a2 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -224,6 +224,7 @@ static int stat_open(struct inode *inode, struct file *file) } static const struct proc_ops stat_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = stat_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3ba9ae83bff5a54a98d831c9cee416e06fa41a08..8d382d4ec0672f32549ac9b2dd9d156fb16dd1da 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -123,38 +123,14 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif -static void vma_stop(struct proc_maps_private *priv) -{ - struct mm_struct *mm = priv->mm; - - release_task_mempolicy(priv); - up_read(&mm->mmap_sem); - mmput(mm); -} - -static struct vm_area_struct * -m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma) -{ - if (vma == priv->tail_vma) - return NULL; - return vma->vm_next ?: priv->tail_vma; -} - -static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma) -{ - if (m->count < m->size) /* vma is copied successfully */ - m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL; -} - static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; - unsigned long last_addr = m->version; + unsigned long last_addr = *ppos; struct mm_struct *mm; struct vm_area_struct *vma; - unsigned int pos = *ppos; - /* See m_cache_vma(). Zero at the start or after lseek. */ + /* See m_next(). Zero at the start or after lseek. */ if (last_addr == -1UL) return NULL; @@ -163,64 +139,59 @@ static void *m_start(struct seq_file *m, loff_t *ppos) return ERR_PTR(-ESRCH); mm = priv->mm; - if (!mm || !mmget_not_zero(mm)) + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; return NULL; + } if (down_read_killable(&mm->mmap_sem)) { mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; return ERR_PTR(-EINTR); } hold_task_mempolicy(priv); priv->tail_vma = get_gate_vma(mm); - if (last_addr) { - vma = find_vma(mm, last_addr - 1); - if (vma && vma->vm_start <= last_addr) - vma = m_next_vma(priv, vma); - if (vma) - return vma; - } - - m->version = 0; - if (pos < mm->map_count) { - for (vma = mm->mmap; pos; pos--) { - m->version = vma->vm_start; - vma = vma->vm_next; - } + vma = find_vma(mm, last_addr); + if (vma) return vma; - } - - /* we do not bother to update m->version in this case */ - if (pos == mm->map_count && priv->tail_vma) - return priv->tail_vma; - vma_stop(priv); - return NULL; + return priv->tail_vma; } -static void *m_next(struct seq_file *m, void *v, loff_t *pos) +static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { struct proc_maps_private *priv = m->private; - struct vm_area_struct *next; + struct vm_area_struct *next, *vma = v; + + if (vma == priv->tail_vma) + next = NULL; + else if (vma->vm_next) + next = vma->vm_next; + else + next = priv->tail_vma; + + *ppos = next ? next->vm_start : -1UL; - (*pos)++; - next = m_next_vma(priv, v); - if (!next) - vma_stop(priv); return next; } static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->mm; - if (!IS_ERR_OR_NULL(v)) - vma_stop(priv); - if (priv->task) { - put_task_struct(priv->task); - priv->task = NULL; - } + if (!priv->task) + return; + + release_task_mempolicy(priv); + up_read(&mm->mmap_sem); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; } static int proc_maps_open(struct inode *inode, struct file *file, @@ -363,7 +334,6 @@ done: static int show_map(struct seq_file *m, void *v) { show_map_vma(m, v); - m_cache_vma(m, v); return 0; } @@ -847,8 +817,6 @@ static int show_smap(struct seq_file *m, void *v) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); show_smap_vma_flags(m, vma); - m_cache_vma(m, vma); - return 0; } @@ -1887,7 +1855,6 @@ static int show_numa_map(struct seq_file *m, void *v) seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); out: seq_putc(m, '\n'); - m_cache_vma(m, vma); return 0; } diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 4075e41408b478227b2a45ea34264b56560e5722..5129efc6f2e6a0fa549fd7a7791e11bf195ec368 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -842,7 +842,7 @@ static void balance_leaf_paste_right_whole(struct tree_balance *tb, struct item_head *pasted; struct buffer_info bi; - buffer_info_init_right(tb, &bi); + buffer_info_init_right(tb, &bi); leaf_shift_right(tb, tb->rnum[0], tb->rbytes); /* append item in R[0] */ diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 45e1a5d11af39a0b913934ecbefbf01c4f6336f3..adb21bea3d600e7f079d4868d44279c39904c99f 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -184,11 +184,12 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) } /* we need to make sure nobody is changing the file size beneath us */ -{ - int depth = reiserfs_write_unlock_nested(inode->i_sb); - inode_lock(inode); - reiserfs_write_lock_nested(inode->i_sb, depth); -} + { + int depth = reiserfs_write_unlock_nested(inode->i_sb); + + inode_lock(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); + } reiserfs_write_lock(inode->i_sb); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 959a066b7bb07f63e111a5990fa9b824ed83da43..1594687582f0cd47ad52c16a6bbd729af89a46bf 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -838,10 +838,10 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode */ INC_DIR_INODE_NLINK(dir) - retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ , - old_format_only(dir->i_sb) ? - EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, - dentry, inode, &security); + retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */, + old_format_only(dir->i_sb) ? + EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, + dentry, inode, &security); if (retval) { DEC_DIR_INODE_NLINK(dir) goto out_failed; @@ -967,7 +967,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) reiserfs_update_sd(&th, inode); DEC_DIR_INODE_NLINK(dir) - dir->i_size -= (DEH_SIZE + de.de_entrylen); + dir->i_size -= (DEH_SIZE + de.de_entrylen); reiserfs_update_sd(&th, dir); /* prevent empty directory from getting lost */ diff --git a/fs/seq_file.c b/fs/seq_file.c index 1600034a929bb1c89df6ce7134ecaad994bd1491..79781ebd2145e48d6a8735e1f44744054e03029b 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -67,13 +67,6 @@ int seq_open(struct file *file, const struct seq_operations *op) // to the lifetime of the file. p->file = file; - /* - * Wrappers around seq_open(e.g. swaps_open) need to be - * aware of this. If they set f_version themselves, they - * should call seq_open first and then set f_version. - */ - file->f_version = 0; - /* * seq_files support lseek() and pread(). They do not implement * write() at all, but we clear FMODE_PWRITE here for historical @@ -94,7 +87,6 @@ static int traverse(struct seq_file *m, loff_t offset) int error = 0; void *p; - m->version = 0; m->index = 0; m->count = m->from = 0; if (!offset) @@ -160,26 +152,12 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) mutex_lock(&m->lock); - /* - * seq_file->op->..m_start/m_stop/m_next may do special actions - * or optimisations based on the file->f_version, so we want to - * pass the file->f_version to those methods. - * - * seq_file->version is just copy of f_version, and seq_file - * methods can treat it simply as file version. - * It is copied in first and copied out after all operations. - * It is convenient to have it as part of structure to avoid the - * need of passing another argument to all the seq_file methods. - */ - m->version = file->f_version; - /* * if request is to read from zero offset, reset iterator to first * record as it might have been already advanced by previous requests */ if (*ppos == 0) { m->index = 0; - m->version = 0; m->count = 0; } @@ -190,7 +168,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (err) { /* With prejudice... */ m->read_pos = 0; - m->version = 0; m->index = 0; m->count = 0; goto Done; @@ -243,7 +220,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; - m->version = 0; p = m->op->start(m, &m->index); } m->op->stop(m, p); @@ -287,7 +263,6 @@ Done: *ppos += copied; m->read_pos += copied; } - file->f_version = m->version; mutex_unlock(&m->lock); return copied; Enomem: @@ -313,7 +288,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) loff_t retval = -EINVAL; mutex_lock(&m->lock); - m->version = file->f_version; switch (whence) { case SEEK_CUR: offset += file->f_pos; @@ -329,7 +303,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) /* with extreme prejudice... */ file->f_pos = 0; m->read_pos = 0; - m->version = 0; m->index = 0; m->count = 0; } else { @@ -340,7 +313,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) file->f_pos = offset; } } - file->f_version = m->version; mutex_unlock(&m->lock); return retval; } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 703c1c3faa6ec02f5b31d2ea8bd27bcd67be7cb3..e39fdec8a0b08be4b6262abf6b37919fc735c38c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -314,8 +314,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, if (!pmd_present(_pmd)) goto out; - if (pmd_trans_huge(_pmd)) + if (pmd_trans_huge(_pmd)) { + if (!pmd_write(_pmd) && (reason & VM_UFFD_WP)) + ret = true; goto out; + } /* * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it @@ -328,6 +331,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, */ if (pte_none(*pte)) ret = true; + if (!pte_write(*pte) && (reason & VM_UFFD_WP)) + ret = true; pte_unmap(pte); out: @@ -1287,10 +1292,13 @@ static __always_inline int validate_range(struct mm_struct *mm, return 0; } -static inline bool vma_can_userfault(struct vm_area_struct *vma) +static inline bool vma_can_userfault(struct vm_area_struct *vma, + unsigned long vm_flags) { - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); + /* FIXME: add WP support to hugetlbfs and shmem */ + return vma_is_anonymous(vma) || + ((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) && + !(vm_flags & VM_UFFD_WP)); } static int userfaultfd_register(struct userfaultfd_ctx *ctx, @@ -1322,15 +1330,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vm_flags = 0; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; - if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) vm_flags |= VM_UFFD_WP; - /* - * FIXME: remove the below error constraint by - * implementing the wprotect tracking mode. - */ - ret = -EINVAL; - goto out; - } ret = validate_range(mm, &uffdio_register.range.start, uffdio_register.range.len); @@ -1380,7 +1381,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (!vma_can_userfault(cur)) + if (!vma_can_userfault(cur, vm_flags)) goto out_unlock; /* @@ -1408,6 +1409,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (end & (vma_hpagesize - 1)) goto out_unlock; } + if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE)) + goto out_unlock; /* * Check that this vma isn't already owned by a @@ -1437,7 +1440,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_can_userfault(vma)); + BUG_ON(!vma_can_userfault(vma, vm_flags)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); @@ -1492,14 +1495,24 @@ out_unlock: up_write(&mm->mmap_sem); mmput(mm); if (!ret) { + __u64 ioctls_out; + + ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : + UFFD_API_RANGE_IOCTLS; + + /* + * Declare the WP ioctl only if the WP mode is + * specified and all checks passed with the range + */ + if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)) + ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT); + /* * Now that we scanned all vmas we can already tell * userland which ioctls methods are guaranteed to * succeed on this range. */ - if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : - UFFD_API_RANGE_IOCTLS, - &user_uffdio_register->ioctls)) + if (put_user(ioctls_out, &user_uffdio_register->ioctls)) ret = -EFAULT; } out: @@ -1575,7 +1588,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. */ - if (!vma_can_userfault(cur)) + if (!vma_can_userfault(cur, cur->vm_flags)) goto out_unlock; found = true; @@ -1589,7 +1602,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_can_userfault(vma)); + BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); /* * Nothing to do: this vma is already registered into this @@ -1724,11 +1737,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, ret = -EINVAL; if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) goto out; - if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) + if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (mmget_not_zero(ctx->mm)) { ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing); + uffdio_copy.len, &ctx->mmap_changing, + uffdio_copy.mode); mmput(ctx->mm); } else { return -ESRCH; @@ -1801,6 +1815,53 @@ out: return ret; } +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + int ret; + struct uffdio_writeprotect uffdio_wp; + struct uffdio_writeprotect __user *user_uffdio_wp; + struct userfaultfd_wake_range range; + bool mode_wp, mode_dontwake; + + if (READ_ONCE(ctx->mmap_changing)) + return -EAGAIN; + + user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; + + if (copy_from_user(&uffdio_wp, user_uffdio_wp, + sizeof(struct uffdio_writeprotect))) + return -EFAULT; + + ret = validate_range(ctx->mm, &uffdio_wp.range.start, + uffdio_wp.range.len); + if (ret) + return ret; + + if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE | + UFFDIO_WRITEPROTECT_MODE_WP)) + return -EINVAL; + + mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP; + mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE; + + if (mode_wp && mode_dontwake) + return -EINVAL; + + ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, + uffdio_wp.range.len, mode_wp, + &ctx->mmap_changing); + if (ret) + return ret; + + if (!mode_wp && !mode_dontwake) { + range.start = uffdio_wp.range.start; + range.len = uffdio_wp.range.len; + wake_userfault(ctx, &range); + } + return ret; +} + static inline unsigned int uffd_ctx_features(__u64 user_features) { /* @@ -1882,6 +1943,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_ZEROPAGE: ret = userfaultfd_zeropage(ctx, arg); break; + case UFFDIO_WRITEPROTECT: + ret = userfaultfd_writeprotect(ctx, arg); + break; } return ret; } diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index e2e2bef07dd23773054f6556e55f04804a463055..329b8c8ca703b670c149e7eaf840682c523f9d7c 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -10,6 +10,7 @@ #include <linux/mm_types.h> #include <linux/bug.h> #include <linux/errno.h> +#include <asm-generic/pgtable_uffd.h> #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h new file mode 100644 index 0000000000000000000000000000000000000000..828966d4c28115a0315ed9d19ab7a83269e5244a --- /dev/null +++ b/include/asm-generic/pgtable_uffd.h @@ -0,0 +1,66 @@ +#ifndef _ASM_GENERIC_PGTABLE_UFFD_H +#define _ASM_GENERIC_PGTABLE_UFFD_H + +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static __always_inline int pte_uffd_wp(pte_t pte) +{ + return 0; +} + +static __always_inline int pmd_uffd_wp(pmd_t pmd) +{ + return 0; +} + +static __always_inline pte_t pte_mkuffd_wp(pte_t pte) +{ + return pte; +} + +static __always_inline pmd_t pmd_mkuffd_wp(pmd_t pmd) +{ + return pmd; +} + +static __always_inline pte_t pte_clear_uffd_wp(pte_t pte) +{ + return pte; +} + +static __always_inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) +{ + return pmd; +} + +static __always_inline pte_t pte_swp_mkuffd_wp(pte_t pte) +{ + return pte; +} + +static __always_inline int pte_swp_uffd_wp(pte_t pte) +{ + return 0; +} + +static __always_inline pte_t pte_swp_clear_uffd_wp(pte_t pte) +{ + return pte; +} + +static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd) +{ + return pmd; +} + +static inline int pmd_swp_uffd_wp(pmd_t pmd) +{ + return 0; +} + +static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd) +{ + return pmd; +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + +#endif /* _ASM_GENERIC_PGTABLE_UFFD_H */ diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index f391f6b500b4cabb42ea3918e8b6fbc1287f4395..3f1649a8cf556255beebef6102f3363e26c088da 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -13,6 +13,7 @@ #include <linux/mmu_notifier.h> #include <linux/swap.h> +#include <linux/hugetlb_inline.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> @@ -398,7 +399,7 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) * We rely on tlb_end_vma() to issue a flush, such that when we reset * these values the batch is empty. */ - tlb->vma_huge = !!(vma->vm_flags & VM_HUGETLB); + tlb->vma_huge = is_vm_hugetlb_page(vma); tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); } diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 47f54b459c264fcd1bac82c313fadf555e3edbd0..9acf654f0b1911ba9293009b18c6dfec65593182 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -162,7 +162,7 @@ static inline __u8 ror8(__u8 word, unsigned int shift) * * This is safe to use for 16- and 8-bit types as well. */ -static inline __s32 sign_extend32(__u32 value, int index) +static __always_inline __s32 sign_extend32(__u32 value, int index) { __u8 shift = 31 - index; return (__s32)(value << shift) >> shift; @@ -173,7 +173,7 @@ static inline __s32 sign_extend32(__u32 value, int index) * @value: value to sign extend * @index: 0 based bit index (0<=index<64) to sign bit */ -static inline __s64 sign_extend64(__u64 value, int index) +static __always_inline __s64 sign_extend64(__u64 value, int index) { __u8 shift = 63 - index; return (__s64)(value << shift) >> shift; diff --git a/include/linux/bits.h b/include/linux/bits.h index a740bbcf3cd271c89874ea72d7ba82dbd01adab4..4671fbf28842718fa74c9664dee74f8e129dbb7e 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -18,12 +18,30 @@ * position @h. For example * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. */ -#define GENMASK(h, l) \ +#if !defined(__ASSEMBLY__) && \ + (!defined(CONFIG_CC_IS_GCC) || CONFIG_GCC_VERSION >= 49000) +#include <linux/build_bug.h> +#define GENMASK_INPUT_CHECK(h, l) \ + (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ + __builtin_constant_p((l) > (h)), (l) > (h), 0))) +#else +/* + * BUILD_BUG_ON_ZERO is not available in h files included from asm files, + * disable the input check if that is the case. + */ +#define GENMASK_INPUT_CHECK(h, l) 0 +#endif + +#define __GENMASK(h, l) \ (((~UL(0)) - (UL(1) << (l)) + 1) & \ (~UL(0) >> (BITS_PER_LONG - 1 - (h)))) +#define GENMASK(h, l) \ + (GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l)) -#define GENMASK_ULL(h, l) \ +#define __GENMASK_ULL(h, l) \ (((~ULL(0)) - (ULL(1) << (l)) + 1) & \ (~ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) +#define GENMASK_ULL(h, l) \ + (GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l)) #endif /* __LINUX_BITS_H */ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 5e88e7e33abecd9ec88003b51fdd34ab611f6eb7..034b0a644efcc49ececa0fcc9b6c95ecfd4daee2 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -347,7 +347,7 @@ static inline void *offset_to_ptr(const int *off) * compiler has support to do so. */ #define compiletime_assert(condition, msg) \ - _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__) + _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) #define compiletime_assert_atomic_type(t) \ compiletime_assert(__native_word(t), \ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 72393a8c1a6c5daeab6bb6403252daeb2aecdb93..e970f97a7fcb1c60f993f6e4eb13e6d947d85e60 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -129,22 +129,13 @@ struct ftrace_likely_data { #define __compiler_offsetof(a, b) __builtin_offsetof(a, b) /* - * Force always-inline if the user requests it so via the .config. * Prefer gnu_inline, so that extern inline functions do not emit an * externally visible function. This makes extern inline behave as per gnu89 * semantics rather than c99. This prevents multiple symbol definition errors * of extern inline functions at link time. * A lot of inline functions can cause havoc with function tracing. - * Do not use __always_inline here, since currently it expands to inline again - * (which would break users of __always_inline). */ -#if !defined(CONFIG_OPTIMIZE_INLINING) -#define inline inline __attribute__((__always_inline__)) __gnu_inline \ - __inline_maybe_unused notrace -#else -#define inline inline __gnu_inline \ - __inline_maybe_unused notrace -#endif +#define inline inline __gnu_inline __inline_maybe_unused notrace /* * gcc provides both __inline__ and __inline as alternate spellings of diff --git a/include/linux/gfp.h b/include/linux/gfp.h index be2754841369eab70f156e1cdacb2258c87a91e5..4aba4c86c626c8509eb62ccf92bf196843878995 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -124,6 +124,8 @@ struct vm_area_struct; * * Reclaim modifiers * ~~~~~~~~~~~~~~~~~ + * Please note that all the following flags are only applicable to sleepable + * allocations (e.g. %GFP_NOWAIT and %GFP_ATOMIC will ignore them). * * %__GFP_IO can start physical IO. * diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f2df2247026aff875f3cf6ded7aa5af6bb26cce0..cfbb0a87c5f0d02610275758df302968f860280e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -46,7 +46,7 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, - int prot_numa); + unsigned long cp_flags); vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); diff --git a/include/linux/memory.h b/include/linux/memory.h index 0b8d791b666908d43e828a24de751ef141050d87..439a89e758d87bd8dfde8dc2fbb6821e82dc3c61 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -26,7 +26,6 @@ struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ - int section_count; /* serialized by mem_sysfs_mutex */ int online_type; /* for passing data to online routine */ int phys_device; /* to which fru does this belong? */ struct device dev; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index f4d59155f3d47a093067b5fd32eeff3c1b48c06a..ef55115320fb9192bb0c69b2c01ec1005ce7942c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -47,9 +47,13 @@ enum { /* Types for control the zone type of onlined and offlined memory */ enum { - MMOP_OFFLINE = -1, - MMOP_ONLINE_KEEP, + /* Offline the memory. */ + MMOP_OFFLINE = 0, + /* Online the memory. Zone depends, see default_zone_for_pfn(). */ + MMOP_ONLINE, + /* Online the memory to ZONE_NORMAL. */ MMOP_ONLINE_KERNEL, + /* Online the memory to ZONE_MOVABLE. */ MMOP_ONLINE_MOVABLE, }; @@ -113,7 +117,10 @@ extern int arch_add_memory(int nid, u64 start, u64 size, struct mhp_restrictions *restrictions); extern u64 max_mem_size; -extern bool memhp_auto_online; +extern int memhp_online_type_from_str(const char *str); + +/* Default online_type (MMOP_*) when new memory blocks are added. */ +extern int memhp_default_online_type; /* If movable_node boot option specified */ extern bool movable_node_enabled; static inline bool movable_node_is_enabled(void) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 60d97e8fd3c0d14e085ee1f9403d0121d636d0cc..8b37c4c9222c7c7fe6b0baf316ba5ac46d97b7b1 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -98,8 +98,6 @@ struct dev_pagemap_ops { * @ref: reference count that pins the devm_memremap_pages() mapping * @internal_ref: internal reference if @ref is not provided by the caller * @done: completion for @internal_ref - * @dev: host device of the mapping for debug - * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h * @flags: PGMAP_* flags to specify defailed behavior * @ops: method table diff --git a/include/linux/mm.h b/include/linux/mm.h index 7dd5c4ccbf85ae14299d9731fa2708f7992507b9..e2f938c5a9d8ca36c08a6491eebcb5ee04f5ab60 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -629,6 +629,12 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma) return false; } + +static inline bool vma_is_accessible(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC); +} + #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow @@ -1765,9 +1771,26 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks); + +/* + * Flags used by change_protection(). For now we make it a bitmap so + * that we can pass in multiple flags just like parameters. However + * for now all the callers are only use one of the flags at the same + * time. + */ +/* Whether we should allow dirty bit accounting */ +#define MM_CP_DIRTY_ACCT (1UL << 0) +/* Whether this protection change is for NUMA hints */ +#define MM_CP_PROT_NUMA (1UL << 1) +/* Whether this change is for write protecting */ +#define MM_CP_UFFD_WP (1UL << 2) /* do wp */ +#define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */ +#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ + MM_CP_UFFD_WP_RESOLVE) + extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa); + unsigned long cp_flags); extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 6f2fef7b0784e0eae3cfb9220d289535e331ec45..219bef41d87c334816d761b6f9b93b31bde99b54 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -6,19 +6,20 @@ #include <linux/swap.h> /** - * page_is_file_cache - should the page be on a file LRU or anon LRU? + * page_is_file_lru - should the page be on a file LRU or anon LRU? * @page: the page to test * - * Returns 1 if @page is page cache page backed by a regular filesystem, - * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed. - * Used by functions that manipulate the LRU lists, to sort a page - * onto the right LRU list. + * Returns 1 if @page is a regular filesystem backed page cache page or a lazily + * freed anonymous page (e.g. via MADV_FREE). Returns 0 if @page is a normal + * anonymous page, a tmpfs page or otherwise ram or swap backed page. Used by + * functions that manipulate the LRU lists, to sort a page onto the right LRU + * list. * * We would like to get this info without a page flag, but the state * needs to survive until the page is last deleted from the LRU, which * could be as far down as __page_cache_release. */ -static inline int page_is_file_cache(struct page *page) +static inline int page_is_file_lru(struct page *page) { return !PageSwapBacked(page); } @@ -75,7 +76,7 @@ static __always_inline void del_page_from_lru_list(struct page *page, */ static inline enum lru_list page_lru_base_type(struct page *page) { - if (page_is_file_cache(page)) + if (page_is_file_lru(page)) return LRU_INACTIVE_FILE; return LRU_INACTIVE_ANON; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index dd555e6d23f370832f018fe6f0c0a56cf150a7f6..4aba6c0c2ba80a8101f9db201ff977aa7cdf3065 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -289,8 +289,8 @@ struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ /* - * This struct defines a memory VMM memory area. There is one of these - * per VM-area/task. A VM area is any part of the process virtual memory + * This struct describes a virtual memory area. There is one of these + * per VM-area/task. A VM area is any part of the process virtual memory * space that has a special rule for the page-fault handlers (ie a shared * library, the executable area etc). */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e84d448988b6594f67c042111953cc4a0991671d..e9892bf9eba913e21f71decc1a4d9265a4a7ac8e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -100,41 +100,6 @@ struct free_area { unsigned long nr_free; }; -/* Used for pages not on another list */ -static inline void add_to_free_area(struct page *page, struct free_area *area, - int migratetype) -{ - list_add(&page->lru, &area->free_list[migratetype]); - area->nr_free++; -} - -/* Used for pages not on another list */ -static inline void add_to_free_area_tail(struct page *page, struct free_area *area, - int migratetype) -{ - list_add_tail(&page->lru, &area->free_list[migratetype]); - area->nr_free++; -} - -#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR -/* Used to preserve page allocation order entropy */ -void add_to_free_area_random(struct page *page, struct free_area *area, - int migratetype); -#else -static inline void add_to_free_area_random(struct page *page, - struct free_area *area, int migratetype) -{ - add_to_free_area(page, area, migratetype); -} -#endif - -/* Used for pages which are on another list */ -static inline void move_to_free_area(struct page *page, struct free_area *area, - int migratetype) -{ - list_move(&page->lru, &area->free_list[migratetype]); -} - static inline struct page *get_page_from_free_area(struct free_area *area, int migratetype) { @@ -142,15 +107,6 @@ static inline struct page *get_page_from_free_area(struct free_area *area, struct page, lru); } -static inline void del_page_from_free_area(struct page *page, - struct free_area *area) -{ - list_del(&page->lru); - __ClearPageBuddy(page); - set_page_private(page, 0); - area->nr_free--; -} - static inline bool free_area_empty(struct free_area *area, int migratetype) { return list_empty(&area->free_list[migratetype]); @@ -708,7 +664,6 @@ struct deferred_split { * Memory statistics and page replacement data structures are maintained on a * per-zone basis. */ -struct bootmem_data; typedef struct pglist_data { struct zone node_zones[MAX_NR_ZONES]; struct zonelist node_zonelists[MAX_ZONELISTS]; @@ -1187,7 +1142,9 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec) #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) struct mem_section_usage { +#ifdef CONFIG_SPARSEMEM_VMEMMAP DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); +#endif /* See declaration of similar field in struct zone */ unsigned long pageblock_flags[0]; }; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 77de28bfefb0186a2516f643f112a73b792cca55..222f6f7b2bb354c1a399a7289b66c6c8eb628f76 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -63,6 +63,11 @@ * page_waitqueue(page) is a wait queue of all tasks waiting for the page * to become unlocked. * + * PG_swapbacked is set when a page uses swap as a backing storage. This are + * usually PageAnon or shmem pages but please note that even anonymous pages + * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as + * a result of MADV_FREE). + * * PG_uptodate tells whether the page's contents is valid. When a read * completes, the page becomes uptodate, unless a disk I/O error happened. * @@ -163,6 +168,9 @@ enum pageflags { /* non-lru isolated movable page */ PG_isolated = PG_reclaim, + + /* Only valid for buddy pages. Used to track pages that are reported */ + PG_reported = PG_uptodate, }; #ifndef __GENERATING_BOUNDS_H @@ -431,6 +439,14 @@ TESTCLEARFLAG(Young, young, PF_ANY) PAGEFLAG(Idle, idle, PF_ANY) #endif +/* + * PageReported() is used to track reported free pages within the Buddy + * allocator. We can use the non-atomic version of the test and set + * operations as both should be shielded with the zone lock to prevent + * any possible races on the setting or clearing of the bit. + */ +__PAGEFLAG(Reported, reported, PF_NO_COMPOUND) + /* * On an anonymous page mapped into a user virtual memory area, * page->mapping points to its anon_vma, not to a struct address_space; diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h new file mode 100644 index 0000000000000000000000000000000000000000..3b99e0ec24f22548cb00e19e2baa8968f24fe880 --- /dev/null +++ b/include/linux/page_reporting.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PAGE_REPORTING_H +#define _LINUX_PAGE_REPORTING_H + +#include <linux/mmzone.h> +#include <linux/scatterlist.h> + +/* This value should always be a power of 2, see page_reporting_cycle() */ +#define PAGE_REPORTING_CAPACITY 32 + +struct page_reporting_dev_info { + /* function that alters pages to make them "reported" */ + int (*report)(struct page_reporting_dev_info *prdev, + struct scatterlist *sg, unsigned int nents); + + /* work struct for processing reports */ + struct delayed_work work; + + /* Current state of page reporting */ + atomic_t state; +}; + +/* Tear-down and bring-up for page reporting devices */ +void page_reporting_unregister(struct page_reporting_dev_info *prdev); +int page_reporting_register(struct page_reporting_dev_info *prdev); +#endif /*_LINUX_PAGE_REPORTING_H */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index f56282491a48fd4c36fddbfb9a908e1ca42c7999..a8f7bd8ea1c62983088a7811595a94a66b15c5be 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -341,9 +341,7 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) if (PageHuge(head)) return head; - VM_BUG_ON_PAGE(PageTail(head), head); - - return head + (index & (compound_nr(head) - 1)); + return head + (index & (hpage_nr_pages(head) - 1)); } struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 4f052496cdfd79b4db4ac104384b212c5790f1ad..0a4f54dd4737b7e3124e9668550811066d8c6616 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -78,9 +78,9 @@ static inline s64 percpu_counter_read(struct percpu_counter *fbc) */ static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc) { - s64 ret = fbc->count; + /* Prevent reloads of fbc->count */ + s64 ret = READ_ONCE(fbc->count); - barrier(); /* Prevent reloads of fbc->count */ if (ret >= 0) return ret; return 0; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 40a7982b7285b22afdebef41413c12b21ed71a49..45c05fd9c99dc67bfacd75560fc2499b3cc4f1ca 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -5,6 +5,7 @@ #ifndef _LINUX_PROC_FS_H #define _LINUX_PROC_FS_H +#include <linux/compiler.h> #include <linux/types.h> #include <linux/fs.h> @@ -12,7 +13,21 @@ struct proc_dir_entry; struct seq_file; struct seq_operations; +enum { + /* + * All /proc entries using this ->proc_ops instance are never removed. + * + * If in doubt, ignore this flag. + */ +#ifdef MODULE + PROC_ENTRY_PERMANENT = 0U, +#else + PROC_ENTRY_PERMANENT = 1U << 0, +#endif +}; + struct proc_ops { + unsigned int proc_flags; int (*proc_open)(struct inode *, struct file *); ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *); ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *); @@ -25,7 +40,7 @@ struct proc_ops { #endif int (*proc_mmap)(struct file *, struct vm_area_struct *); unsigned long (*proc_get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); -}; +} __randomize_layout; #ifdef CONFIG_PROC_FS diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 770c2bf3aa43941336d8c61b50639cddc10c2094..1672cf6f7614554e5cbc35a7d13458992c9101dd 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -21,7 +21,6 @@ struct seq_file { size_t pad_until; loff_t index; loff_t read_pos; - u64 version; struct mutex lock; const struct seq_operations *op; int poll_event; diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d56fefef8905a560f8865dbd1e013ea519324b7f..7a35a690122151d0bb53ee7eaecf726c4e674464 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -78,6 +78,7 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(unsigned int type, bool frontswap, unsigned long *fs_pages_to_unuse); +extern bool shmem_huge_enabled(struct vm_area_struct *vma); extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -114,15 +115,6 @@ static inline bool shmem_file(struct file *file) extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE -extern bool shmem_huge_enabled(struct vm_area_struct *vma); -#else -static inline bool shmem_huge_enabled(struct vm_area_struct *vma) -{ - return false; -} -#endif - #ifdef CONFIG_SHMEM extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 3efa97d482cbea57df8a678635d2c136f2786d40..24d49c732341ab3f85bf18b7478cca921e0317a5 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -19,4 +19,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); + #endif diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 877fd239b6fff261c74bfcb5a5fd125de3d7e741..d9b7c9132c2f66473d351a1ebfe2c9ce533ee1b9 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -68,6 +68,8 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte) if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); + if (pte_swp_uffd_wp(pte)) + pte = pte_swp_clear_uffd_wp(pte); arch_entry = __pte_to_swp_entry(pte); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } @@ -348,7 +350,8 @@ static inline void num_poisoned_pages_inc(void) } #endif -#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) +#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \ + defined(CONFIG_DEVICE_PRIVATE) static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ac9d71e24b81ac91ab38abdea32c364a2884cedb..a8e5f3ea9bb2bb175bb8a3b60219fc40a265ac9f 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -14,6 +14,8 @@ #include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */ #include <linux/fcntl.h> +#include <linux/mm.h> +#include <asm-generic/pgtable_uffd.h> /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining @@ -34,11 +36,14 @@ extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing); + bool *mmap_changing, __u64 mode); extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, bool *mmap_changing); +extern int mwriteprotect_range(struct mm_struct *dst_mm, + unsigned long start, unsigned long len, + bool enable_wp, bool *mmap_changing); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, @@ -52,6 +57,23 @@ static inline bool userfaultfd_missing(struct vm_area_struct *vma) return vma->vm_flags & VM_UFFD_MISSING; } +static inline bool userfaultfd_wp(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_UFFD_WP; +} + +static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, + pte_t pte) +{ + return userfaultfd_wp(vma) && pte_uffd_wp(pte); +} + +static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, + pmd_t pmd) +{ + return userfaultfd_wp(vma) && pmd_uffd_wp(pmd); +} + static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); @@ -96,6 +118,24 @@ static inline bool userfaultfd_missing(struct vm_area_struct *vma) return false; } +static inline bool userfaultfd_wp(struct vm_area_struct *vma) +{ + return false; +} + +static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, + pte_t pte) +{ + return false; +} + +static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, + pmd_t pmd) +{ + return false; +} + + static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return false; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 47a3441cf4c4a4d59f588aac2c181ba1d8098c4f..ffef0f279747673f4bf4b89cf84e53a1af945e23 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -73,9 +73,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_TRANSPARENT_HUGEPAGE THP_FAULT_ALLOC, THP_FAULT_FALLBACK, + THP_FAULT_FALLBACK_CHARGE, THP_COLLAPSE_ALLOC, THP_COLLAPSE_ALLOC_FAILED, THP_FILE_ALLOC, + THP_FILE_FALLBACK, + THP_FILE_FALLBACK_CHARGE, THP_FILE_MAPPED, THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED, @@ -115,6 +118,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define THP_FILE_ALLOC ({ BUILD_BUG(); 0; }) +#define THP_FILE_FALLBACK ({ BUILD_BUG(); 0; }) +#define THP_FILE_FALLBACK_CHARGE ({ BUILD_BUG(); 0; }) #define THP_FILE_MAPPED ({ BUILD_BUG(); 0; }) #endif diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index d82a0f4e824dd55e74a6791e81ee76a04c4c2d1a..70e32ff096ec0c2b84a24a4b7df00def3aba8b98 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -13,6 +13,7 @@ EM( SCAN_PMD_NULL, "pmd_null") \ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ + EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \ EM( SCAN_PAGE_RO, "no_writable_page") \ EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \ EM( SCAN_PAGE_NULL, "page_null") \ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index a1675d43777e8feb9165401af1b7c4a26d76f306..5fb7520343863648160da1564621e83422985d97 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -154,6 +154,7 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" ) {VM_ACCOUNT, "account" }, \ {VM_NORESERVE, "noreserve" }, \ {VM_HUGETLB, "hugetlb" }, \ + {VM_SYNC, "sync" }, \ __VM_ARCH_SPECIFIC_1 , \ {VM_WIPEONFORK, "wipeonfork" }, \ {VM_DONTDUMP, "dontdump" }, \ diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index a5ab2973e8dc3bed647a2f099cf0580cd9e5e157..74bb594ccb2580ecf09b52f0c65c5a2bc8a771ec 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -323,7 +323,7 @@ TRACE_EVENT(mm_vmscan_writepage, TP_fast_assign( __entry->pfn = page_to_pfn(page); __entry->reclaim_flags = trace_reclaim_flags( - page_is_file_cache(page)); + page_is_file_lru(page)); ), TP_printk("page=%p pfn=%lu flags=%s", diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 48f1a7c2f1f056117ce80166fdd126d1e32c08b4..e7e98bde221f1140bf9f35aaf7e4b7d1a0f0f8d7 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -19,7 +19,8 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ +#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ + UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ @@ -34,7 +35,8 @@ #define UFFD_API_RANGE_IOCTLS \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ - (__u64)1 << _UFFDIO_ZEROPAGE) + (__u64)1 << _UFFDIO_ZEROPAGE | \ + (__u64)1 << _UFFDIO_WRITEPROTECT) #define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY) @@ -52,6 +54,7 @@ #define _UFFDIO_WAKE (0x02) #define _UFFDIO_COPY (0x03) #define _UFFDIO_ZEROPAGE (0x04) +#define _UFFDIO_WRITEPROTECT (0x06) #define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ @@ -68,6 +71,8 @@ struct uffdio_copy) #define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ struct uffdio_zeropage) +#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ + struct uffdio_writeprotect) /* read() structure */ struct uffd_msg { @@ -203,13 +208,14 @@ struct uffdio_copy { __u64 dst; __u64 src; __u64 len; +#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) /* - * There will be a wrprotection flag later that allows to map - * pages wrprotected on the fly. And such a flag will be - * available if the wrprotection ioctl are implemented for the - * range according to the uffdio_register.ioctls. + * UFFDIO_COPY_MODE_WP will map the page write protected on + * the fly. UFFDIO_COPY_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. */ -#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) +#define UFFDIO_COPY_MODE_WP ((__u64)1<<1) __u64 mode; /* @@ -231,4 +237,24 @@ struct uffdio_zeropage { __s64 zeropage; }; +struct uffdio_writeprotect { + struct uffdio_range range; +/* + * UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range, + * unset the flag to undo protection of a range which was previously + * write protected. + * + * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up + * any wait thread after the operation succeeds. + * + * NOTE: Write protecting a region (WP=1) is unrelated to page faults, + * therefore DONTWAKE flag is meaningless with WP=1. Removing write + * protection (WP=0) in response to a page fault wakes the faulting + * task unless DONTWAKE is set. + */ +#define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0) +#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1) + __u64 mode; +}; + #endif /* _LINUX_USERFAULTFD_H */ diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index a1966cd7b6774944a1d818bd823de0253c4a2908..19974392d3240fad6265dd7a7a6e045636268b17 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -36,6 +36,7 @@ #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ +#define VIRTIO_BALLOON_F_REPORTING 5 /* Page reporting virtqueue */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12 diff --git a/init/Kconfig b/init/Kconfig index 1c12059e0f7e6b643f10176b5389c83aa9330c51..574b7215aa6af37532c14aac1572c782cd76e0cc 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -872,7 +872,7 @@ config BLK_CGROUP This option only enables generic Block IO controller infrastructure. One needs to also enable actual IO controlling logic/policy. For enabling proportional weight division of disk bandwidth in CFQ, set - CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set + CONFIG_BFQ_GROUP_IOSCHED=y; for enabling throttling policy, set CONFIG_BLK_DEV_THROTTLING=y. See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information. @@ -1538,7 +1538,6 @@ config AIO config IO_URING bool "Enable IO uring support" if EXPERT - select ANON_INODES select IO_WQ default y help @@ -1556,6 +1555,11 @@ config ADVISE_SYSCALLS applications use these syscalls, you can disable this option to save space. +config HAVE_ARCH_USERFAULTFD_WP + bool + help + Arch has userfaultfd write protection support + config MEMBARRIER bool "Enable membarrier() system call" if EXPERT default y diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 49a05ba3000dbab9f6821e3deb3a5223c29a528a..dc8307bf2d74d3b550d2fac210dae09920ef0bcf 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -239,11 +239,10 @@ static inline void msg_tree_erase(struct posix_msg_tree_node *leaf, info->msg_tree_rightmost = rb_prev(node); rb_erase(node, &info->msg_tree); - if (info->node_cache) { + if (info->node_cache) kfree(leaf); - } else { + else info->node_cache = leaf; - } } static inline struct msg_msg *msg_get(struct mqueue_inode_info *info) diff --git a/ipc/shm.c b/ipc/shm.c index ce1ca9f7c6e97ca5035963d2d64e3c57fefe7f6c..0ba6add05b35afdb0ee7947c6ece7fb9ecc64c65 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1332,7 +1332,7 @@ static int copy_compat_shmid_from_user(struct shmid64_ds *out, void __user *buf, } } -long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr, int version) +static long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr, int version) { struct ipc_namespace *ns; struct shmid64_ds sem64; diff --git a/ipc/util.c b/ipc/util.c index fe61df53775ace84da0863ea36106c2206aa7d5e..97638eb2d7cb1c9ed916f45130427a0a16a4b9f3 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -885,6 +885,7 @@ static int sysvipc_proc_release(struct inode *inode, struct file *file) } static const struct proc_ops sysvipc_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = sysvipc_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index 7fa0c4ae6394f028fa09694b219314dd3d7d8731..8a44b93da0f3d02eb2e12da86aa7d2534540c190 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -6,7 +6,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KERNEL_XZ=y # CONFIG_KERNEL_LZO is not set # CONFIG_KERNEL_LZ4 is not set -CONFIG_OPTIMIZE_INLINING=y # CONFIG_SLAB is not set # CONFIG_SLUB is not set CONFIG_SLOB=y diff --git a/kernel/events/core.c b/kernel/events/core.c index 81e6d80cb219efe7f6398100a40d5918ca398730..55e44417f66de887f876fd50989fd3510f190c9b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -28,6 +28,7 @@ #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/hardirq.h> +#include <linux/hugetlb.h> #include <linux/rculist.h> #include <linux/uaccess.h> #include <linux/syscalls.h> @@ -7973,7 +7974,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) flags |= MAP_EXECUTABLE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; - if (vma->vm_flags & VM_HUGETLB) + if (is_vm_hugetlb_page(vma)) flags |= MAP_HUGETLB; if (file) { diff --git a/kernel/extable.c b/kernel/extable.c index 7681f87e89ddac0678d9e32471b8cf2ff560ee50..b0ea5eb0c3b43da49b8a94aa3337d666878b6af1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -34,7 +34,8 @@ u32 __initdata __visible main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) { - if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) { + if (main_extable_sort_needed && + &__stop___ex_table > &__start___ex_table) { pr_notice("Sorting __ex_table...\n"); sort_extable(__start___ex_table, __stop___ex_table); } diff --git a/kernel/fork.c b/kernel/fork.c index d2a967bf85d5eed4ccad59e51dbc89da332bffeb..4385f3d639f23c2347d31061ede5081cc51616cf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -361,6 +361,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) if (new) { *new = *orig; INIT_LIST_HEAD(&new->anon_vma_chain); + new->vm_next = new->vm_prev = NULL; } return new; } @@ -553,14 +554,15 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto fail_nomem_anon_vma_fork; if (tmp->vm_flags & VM_WIPEONFORK) { - /* VM_WIPEONFORK gets a clean slate in the child. */ + /* + * VM_WIPEONFORK gets a clean slate in the child. + * Don't prepare anon_vma until fault since we don't + * copy page for current vma. + */ tmp->anon_vma = NULL; - if (anon_vma_prepare(tmp)) - goto fail_nomem_anon_vma_fork; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); - tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index e5eb5ea7ea59824d8cbd6fd68f244657ba07a55d..5e891c3c2d93b3d19a16058f41f08e4435aa8a11 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -58,7 +58,7 @@ struct gcov_node { struct dentry *dentry; struct dentry **links; int num_loaded; - char name[0]; + char name[]; }; static const char objtree[] = OBJTREE; diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 801ee4b0b969b516a5f17b111d5dba9784b7e92b..acb83558e5df4ed54f5413bded226bbffe1a8d87 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -38,7 +38,7 @@ static struct gcov_info *gcov_info_head; struct gcov_fn_info { unsigned int ident; unsigned int checksum; - unsigned int n_ctrs[0]; + unsigned int n_ctrs[]; }; /** @@ -78,7 +78,7 @@ struct gcov_info { unsigned int n_functions; const struct gcov_fn_info *functions; unsigned int ctr_mask; - struct gcov_ctr_info counts[0]; + struct gcov_ctr_info counts[]; }; /** @@ -352,7 +352,7 @@ struct gcov_iterator { unsigned int count; int num_types; - struct type_info type_info[0]; + struct type_info type_info[]; }; static struct gcov_fn_info *get_func(struct gcov_iterator *iter) diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index ec37563674d62497d1d4d356ab9b547242469f63..908fdf5098c32e088e1adff65d2c4f3f2472ecad 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -68,7 +68,7 @@ struct gcov_fn_info { unsigned int ident; unsigned int lineno_checksum; unsigned int cfg_checksum; - struct gcov_ctr_info ctrs[0]; + struct gcov_ctr_info ctrs[]; }; /** diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index a9b3f660dee7b7973e93c035d1fb867276457642..16c8c605f4b0facfc751198a8450cc91405e2f1c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -175,7 +175,6 @@ unsigned long kallsyms_lookup_name(const char *name) } return module_kallsyms_lookup_name(name); } -EXPORT_SYMBOL_GPL(kallsyms_lookup_name); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), @@ -194,7 +193,6 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, } return module_kallsyms_on_each_symbol(fn, data); } -EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol); static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, diff --git a/kernel/kmod.c b/kernel/kmod.c index bc6addd9152b4db58c57757f1fe0a2911a16e459..8b2b311afa957d600f13dabd6442a8657b815328 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -35,7 +35,7 @@ * (u64) THREAD_SIZE * 8UL); * * If you need less than 50 threads would mean we're dealing with systems - * smaller than 3200 pages. This assuems you are capable of having ~13M memory, + * smaller than 3200 pages. This assumes you are capable of having ~13M memory, * and this would only be an be an upper limit, after which the OOM killer * would take effect. Systems like these are very unlikely if modules are * enabled. diff --git a/kernel/module.c b/kernel/module.c index 33569a01d6e1bd1b4937676301a8119dbe4e7b4f..3447f3b7487067f2fc72831e6d2811e60d4d18f1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4355,6 +4355,7 @@ static int modules_open(struct inode *inode, struct file *file) } static const struct proc_ops modules_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = modules_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d7fb20adabeb3027133b135e82547549826d9b2b..1ea3dddafe694cbb94554cc8999ad116afe9263b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2799,7 +2799,7 @@ static void task_numa_work(struct callback_head *work) * Skip inaccessible VMAs to avoid any confusion between * PROT_NONE and NUMA hinting ptes */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + if (!vma_is_accessible(vma)) continue; do { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d1398cef3b183b183c18d0a38ec50faf70ecbb29..50c1f5f08e6f17952295dd561e519299776423ae 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -305,18 +305,6 @@ config HEADERS_INSTALL user-space program samples. It is also needed by some features such as uapi header sanity checks. -config OPTIMIZE_INLINING - def_bool y - help - This option determines if the kernel forces gcc to inline the functions - developers have marked 'inline'. Doing so takes away freedom from gcc to - do what it thinks is best, which is desirable for the gcc 3.x series of - compilers. The gcc 4.x series have a rewritten inlining algorithm and - enabling this option will generate a smaller kernel there. Hopefully - this algorithm is so good that allowing gcc 4.x and above to make the - decision will become the default in the future. Until then this option - is there to test gcc for this. - config DEBUG_SECTION_MISMATCH bool "Enable full Section mismatch analysis" help @@ -988,6 +976,18 @@ config WQ_WATCHDOG state. This can be configured through kernel parameter "workqueue.watchdog_thresh" and its sysfs counterpart. +config TEST_LOCKUP + tristate "Test module to generate lockups" + help + This builds the "test_lockup" module that helps to make sure + that watchdogs and lockup detectors are working properly. + + Depending on module parameters it could emulate soft or hard + lockup, "hung task", or locking arbitrary lock for a long time. + Also it could generate series of lockups with cooling-down periods. + + If unsure, say N. + endmenu # "Debug lockups and hangs" menu "Scheduler Debugging" @@ -1655,7 +1655,7 @@ config FAILSLAB Provide fault-injection capability for kmalloc. config FAIL_PAGE_ALLOC - bool "Fault-injection capabilitiy for alloc_pages()" + bool "Fault-injection capability for alloc_pages()" depends on FAULT_INJECTION help Provide fault-injection capability for alloc_pages(). diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 0e04fcb3ab3d07292c4c0964ed10fcd439ca5583..48469c95d78e6da6c0656502c6d11b1216cef3c9 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -2,18 +2,50 @@ config ARCH_HAS_UBSAN_SANITIZE_ALL bool -config UBSAN +menuconfig UBSAN bool "Undefined behaviour sanity checker" help - This option enables undefined behaviour sanity checker + This option enables the Undefined Behaviour sanity checker. Compile-time instrumentation is used to detect various undefined - behaviours in runtime. Various types of checks may be enabled - via boot parameter ubsan_handle - (see: Documentation/dev-tools/ubsan.rst). + behaviours at runtime. For more details, see: + Documentation/dev-tools/ubsan.rst + +if UBSAN + +config UBSAN_TRAP + bool "On Sanitizer warnings, abort the running kernel code" + depends on $(cc-option, -fsanitize-undefined-trap-on-error) + help + Building kernels with Sanitizer features enabled tends to grow + the kernel size by around 5%, due to adding all the debugging + text on failure paths. To avoid this, Sanitizer instrumentation + can just issue a trap. This reduces the kernel size overhead but + turns all warnings (including potentially harmless conditions) + into full exceptions that abort the running kernel code + (regardless of context, locks held, etc), which may destabilize + the system. For some system builders this is an acceptable + trade-off. + +config UBSAN_BOUNDS + bool "Perform array index bounds checking" + default UBSAN + help + This option enables detection of directly indexed out of bounds + array accesses, where the array size is known at compile time. + Note that this does not protect array overflows via bad calls + to the {str,mem}*cpy() family of functions (that is addressed + by CONFIG_FORTIFY_SOURCE). + +config UBSAN_MISC + bool "Enable all other Undefined Behavior sanity checks" + default UBSAN + help + This option enables all sanity checks that don't have their + own Kconfig options. Disable this if you only want to have + individually selected checks. config UBSAN_SANITIZE_ALL bool "Enable instrumentation for the entire kernel" - depends on UBSAN depends on ARCH_HAS_UBSAN_SANITIZE_ALL # We build with -Wno-maybe-uninitilzed, but we still want to @@ -30,7 +62,6 @@ config UBSAN_SANITIZE_ALL config UBSAN_NO_ALIGNMENT bool "Disable checking of pointers alignment" - depends on UBSAN default y if HAVE_EFFICIENT_UNALIGNED_ACCESS help This option disables the check of unaligned memory accesses. @@ -43,7 +74,9 @@ config UBSAN_ALIGNMENT config TEST_UBSAN tristate "Module for testing for undefined behavior detection" - depends on m && UBSAN + depends on m help This is a test module for UBSAN. It triggers various undefined behavior, and detect it. + +endif # if UBSAN diff --git a/lib/Makefile b/lib/Makefile index 09a8acb0cf9235598b70ac2d491a4e141ce0b2ff..685aee60de1d5ea7d72041e4884e9d5cac1cdaf8 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -87,9 +87,11 @@ obj-$(CONFIG_TEST_KMOD) += test_kmod.o obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o +CFLAGS_test_stackinit.o += $(call cc-disable-warning, switch-unreachable) obj-$(CONFIG_TEST_STACKINIT) += test_stackinit.o obj-$(CONFIG_TEST_BLACKHOLE_DEV) += test_blackhole_dev.o obj-$(CONFIG_TEST_MEMINIT) += test_meminit.o +obj-$(CONFIG_TEST_LOCKUP) += test_lockup.o obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/ @@ -221,6 +223,10 @@ obj-$(CONFIG_MEMREGION) += memregion.o obj-$(CONFIG_STMP_DEVICE) += stmp_device.o obj-$(CONFIG_IRQ_POLL) += irq_poll.o +# stackdepot.c should not be instrumented or call instrumented functions. +# Prevent the compiler from calling builtins like memcmp() or bcmp() from this +# file. +CFLAGS_stackdepot.o += -fno-builtin obj-$(CONFIG_STACKDEPOT) += stackdepot.o KASAN_SANITIZE_stackdepot.o := n KCOV_INSTRUMENT_stackdepot.o := n @@ -280,7 +286,9 @@ quiet_cmd_build_OID_registry = GEN $@ clean-files += oid_registry_data.c obj-$(CONFIG_UCS2_STRING) += ucs2_string.o +ifneq ($(CONFIG_UBSAN_TRAP),y) obj-$(CONFIG_UBSAN) += ubsan.o +endif UBSAN_SANITIZE_ubsan.o := n KASAN_SANITIZE_ubsan.o := n diff --git a/lib/bch.c b/lib/bch.c index 5db6d3a4c8a6fd3b841490a734881d35ad949779..052d3fb753a0819ac5459072acbc706611c8da1d 100644 --- a/lib/bch.c +++ b/lib/bch.c @@ -102,7 +102,7 @@ */ struct gf_poly { unsigned int deg; /* polynomial degree */ - unsigned int c[0]; /* polynomial terms */ + unsigned int c[]; /* polynomial terms */ }; /* given its degree, compute a polynomial size in bytes */ diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index aae17d9522e5e0a8265daab1870b7cfeffb54b7e..8f199f403ab50656c4d91344932204bee6ac9d17 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -1031,7 +1031,7 @@ static int __init dynamic_debug_init(void) int n = 0, entries = 0, modct = 0; int verbose_bytes = 0; - if (__start___verbose == __stop___verbose) { + if (&__start___verbose == &__stop___verbose) { pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); return 1; } diff --git a/lib/rbtree.c b/lib/rbtree.c index abc86c6a31777d4ed88c0ad704b2aee0588d052f..8545872e61db9880bf2112ed7561f0cd5b7d8e54 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -503,7 +503,7 @@ struct rb_node *rb_next(const struct rb_node *node) if (node->rb_right) { node = node->rb_right; while (node->rb_left) - node=node->rb_left; + node = node->rb_left; return (struct rb_node *)node; } @@ -535,7 +535,7 @@ struct rb_node *rb_prev(const struct rb_node *node) if (node->rb_left) { node = node->rb_left; while (node->rb_right) - node=node->rb_right; + node = node->rb_right; return (struct rb_node *)node; } diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 5813072bc58955acf5bceeaa19814315f8f904ee..5d63a8857f361d0062426d71ee525a7e2e25b2b8 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -832,7 +832,7 @@ EXPORT_SYMBOL(sg_miter_stop); * @buflen: The number of bytes to copy * @skip: Number of bytes to skip before copying * @to_buffer: transfer direction (true == from an sg list to a - * buffer, false == from a buffer to an sg list + * buffer, false == from a buffer to an sg list) * * Returns the number of copied bytes. * diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 81c69c08d1d157189d253a0cbe2a2bbd82cff421..2caffc64e4c82891948abb97019c0c88d1b83584 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -20,6 +20,7 @@ */ #include <linux/gfp.h> +#include <linux/interrupt.h> #include <linux/jhash.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -202,9 +203,20 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) { union handle_parts parts = { .handle = handle }; - void *slab = stack_slabs[parts.slabindex]; + void *slab; size_t offset = parts.offset << STACK_ALLOC_ALIGN; - struct stack_record *stack = slab + offset; + struct stack_record *stack; + + *entries = NULL; + if (parts.slabindex > depot_index) { + WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n", + parts.slabindex, depot_index, handle); + return 0; + } + slab = stack_slabs[parts.slabindex]; + if (!slab) + return 0; + stack = slab + offset; *entries = stack->entries; return stack->size; @@ -305,3 +317,26 @@ fast_exit: return retval; } EXPORT_SYMBOL_GPL(stack_depot_save); + +static inline int in_irqentry_text(unsigned long ptr) +{ + return (ptr >= (unsigned long)&__irqentry_text_start && + ptr < (unsigned long)&__irqentry_text_end) || + (ptr >= (unsigned long)&__softirqentry_text_start && + ptr < (unsigned long)&__softirqentry_text_end); +} + +unsigned int filter_irq_stacks(unsigned long *entries, + unsigned int nr_entries) +{ + unsigned int i; + + for (i = 0; i < nr_entries; i++) { + if (in_irqentry_text(entries[i])) { + /* Include the irqentry function into the stack. */ + return i + 1; + } + } + return nr_entries; +} +EXPORT_SYMBOL_GPL(filter_irq_stacks); diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 61ed71c1daba9776dff638e3f01034dfae7342ea..6b13150667f50f087570c69ce8c646850c5e6f43 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -278,6 +278,8 @@ static void __init test_replace(void) unsigned int nlongs = DIV_ROUND_UP(nbits, BITS_PER_LONG); DECLARE_BITMAP(bmap, 1024); + BUILD_BUG_ON(EXP2_IN_BITS < nbits * 2); + bitmap_zero(bmap, 1024); bitmap_replace(bmap, &exp2[0 * nlongs], &exp2[1 * nlongs], exp2_to_exp3_mask, nbits); expect_eq_bitmap(bmap, exp3_0_1, nbits); diff --git a/lib/test_kmod.c b/lib/test_kmod.c index 9cf77628fc913e0e63ae29e843c34f6a685e9f3d..e651c37d56dbdd445b23ab1ab1d3d5816ed8d86d 100644 --- a/lib/test_kmod.c +++ b/lib/test_kmod.c @@ -204,7 +204,7 @@ static void test_kmod_put_module(struct kmod_test_device_info *info) case TEST_KMOD_DRIVER: break; case TEST_KMOD_FS_TYPE: - if (info && info->fs_sync && info->fs_sync->owner) + if (info->fs_sync && info->fs_sync->owner) module_put(info->fs_sync->owner); break; default: diff --git a/lib/test_lockup.c b/lib/test_lockup.c new file mode 100644 index 0000000000000000000000000000000000000000..ea09ca335b21451f964cc9f96ed35e15f46bc550 --- /dev/null +++ b/lib/test_lockup.c @@ -0,0 +1,599 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test module to generate lockups + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/clock.h> +#include <linux/cpu.h> +#include <linux/nmi.h> +#include <linux/mm.h> +#include <linux/uaccess.h> +#include <linux/file.h> + +static unsigned int time_secs; +module_param(time_secs, uint, 0600); +MODULE_PARM_DESC(time_secs, "lockup time in seconds, default 0"); + +static unsigned int time_nsecs; +module_param(time_nsecs, uint, 0600); +MODULE_PARM_DESC(time_nsecs, "nanoseconds part of lockup time, default 0"); + +static unsigned int cooldown_secs; +module_param(cooldown_secs, uint, 0600); +MODULE_PARM_DESC(cooldown_secs, "cooldown time between iterations in seconds, default 0"); + +static unsigned int cooldown_nsecs; +module_param(cooldown_nsecs, uint, 0600); +MODULE_PARM_DESC(cooldown_nsecs, "nanoseconds part of cooldown, default 0"); + +static unsigned int iterations = 1; +module_param(iterations, uint, 0600); +MODULE_PARM_DESC(iterations, "lockup iterations, default 1"); + +static bool all_cpus; +module_param(all_cpus, bool, 0400); +MODULE_PARM_DESC(all_cpus, "trigger lockup at all cpus at once"); + +static int wait_state; +static char *state = "R"; +module_param(state, charp, 0400); +MODULE_PARM_DESC(state, "wait in 'R' running (default), 'D' uninterruptible, 'K' killable, 'S' interruptible state"); + +static bool use_hrtimer; +module_param(use_hrtimer, bool, 0400); +MODULE_PARM_DESC(use_hrtimer, "use high-resolution timer for sleeping"); + +static bool iowait; +module_param(iowait, bool, 0400); +MODULE_PARM_DESC(iowait, "account sleep time as iowait"); + +static bool lock_read; +module_param(lock_read, bool, 0400); +MODULE_PARM_DESC(lock_read, "lock read-write locks for read"); + +static bool lock_single; +module_param(lock_single, bool, 0400); +MODULE_PARM_DESC(lock_single, "acquire locks only at one cpu"); + +static bool reacquire_locks; +module_param(reacquire_locks, bool, 0400); +MODULE_PARM_DESC(reacquire_locks, "release and reacquire locks/irq/preempt between iterations"); + +static bool touch_softlockup; +module_param(touch_softlockup, bool, 0600); +MODULE_PARM_DESC(touch_softlockup, "touch soft-lockup watchdog between iterations"); + +static bool touch_hardlockup; +module_param(touch_hardlockup, bool, 0600); +MODULE_PARM_DESC(touch_hardlockup, "touch hard-lockup watchdog between iterations"); + +static bool call_cond_resched; +module_param(call_cond_resched, bool, 0600); +MODULE_PARM_DESC(call_cond_resched, "call cond_resched() between iterations"); + +static bool measure_lock_wait; +module_param(measure_lock_wait, bool, 0400); +MODULE_PARM_DESC(measure_lock_wait, "measure lock wait time"); + +static unsigned long lock_wait_threshold = ULONG_MAX; +module_param(lock_wait_threshold, ulong, 0400); +MODULE_PARM_DESC(lock_wait_threshold, "print lock wait time longer than this in nanoseconds, default off"); + +static bool test_disable_irq; +module_param_named(disable_irq, test_disable_irq, bool, 0400); +MODULE_PARM_DESC(disable_irq, "disable interrupts: generate hard-lockups"); + +static bool disable_softirq; +module_param(disable_softirq, bool, 0400); +MODULE_PARM_DESC(disable_softirq, "disable bottom-half irq handlers"); + +static bool disable_preempt; +module_param(disable_preempt, bool, 0400); +MODULE_PARM_DESC(disable_preempt, "disable preemption: generate soft-lockups"); + +static bool lock_rcu; +module_param(lock_rcu, bool, 0400); +MODULE_PARM_DESC(lock_rcu, "grab rcu_read_lock: generate rcu stalls"); + +static bool lock_mmap_sem; +module_param(lock_mmap_sem, bool, 0400); +MODULE_PARM_DESC(lock_mmap_sem, "lock mm->mmap_sem: block procfs interfaces"); + +static unsigned long lock_rwsem_ptr; +module_param_unsafe(lock_rwsem_ptr, ulong, 0400); +MODULE_PARM_DESC(lock_rwsem_ptr, "lock rw_semaphore at address"); + +static unsigned long lock_mutex_ptr; +module_param_unsafe(lock_mutex_ptr, ulong, 0400); +MODULE_PARM_DESC(lock_mutex_ptr, "lock mutex at address"); + +static unsigned long lock_spinlock_ptr; +module_param_unsafe(lock_spinlock_ptr, ulong, 0400); +MODULE_PARM_DESC(lock_spinlock_ptr, "lock spinlock at address"); + +static unsigned long lock_rwlock_ptr; +module_param_unsafe(lock_rwlock_ptr, ulong, 0400); +MODULE_PARM_DESC(lock_rwlock_ptr, "lock rwlock at address"); + +static unsigned int alloc_pages_nr; +module_param_unsafe(alloc_pages_nr, uint, 0600); +MODULE_PARM_DESC(alloc_pages_nr, "allocate and free pages under locks"); + +static unsigned int alloc_pages_order; +module_param(alloc_pages_order, uint, 0400); +MODULE_PARM_DESC(alloc_pages_order, "page order to allocate"); + +static gfp_t alloc_pages_gfp = GFP_KERNEL; +module_param_unsafe(alloc_pages_gfp, uint, 0400); +MODULE_PARM_DESC(alloc_pages_gfp, "allocate pages with this gfp_mask, default GFP_KERNEL"); + +static bool alloc_pages_atomic; +module_param(alloc_pages_atomic, bool, 0400); +MODULE_PARM_DESC(alloc_pages_atomic, "allocate pages with GFP_ATOMIC"); + +static bool reallocate_pages; +module_param(reallocate_pages, bool, 0400); +MODULE_PARM_DESC(reallocate_pages, "free and allocate pages between iterations"); + +struct file *test_file; +struct inode *test_inode; +static char test_file_path[256]; +module_param_string(file_path, test_file_path, sizeof(test_file_path), 0400); +MODULE_PARM_DESC(file_path, "file path to test"); + +static bool test_lock_inode; +module_param_named(lock_inode, test_lock_inode, bool, 0400); +MODULE_PARM_DESC(lock_inode, "lock file -> inode -> i_rwsem"); + +static bool test_lock_mapping; +module_param_named(lock_mapping, test_lock_mapping, bool, 0400); +MODULE_PARM_DESC(lock_mapping, "lock file -> mapping -> i_mmap_rwsem"); + +static bool test_lock_sb_umount; +module_param_named(lock_sb_umount, test_lock_sb_umount, bool, 0400); +MODULE_PARM_DESC(lock_sb_umount, "lock file -> sb -> s_umount"); + +static atomic_t alloc_pages_failed = ATOMIC_INIT(0); + +static atomic64_t max_lock_wait = ATOMIC64_INIT(0); + +static struct task_struct *main_task; +static int master_cpu; + +static void test_lock(bool master, bool verbose) +{ + u64 uninitialized_var(wait_start); + + if (measure_lock_wait) + wait_start = local_clock(); + + if (lock_mutex_ptr && master) { + if (verbose) + pr_notice("lock mutex %ps\n", (void *)lock_mutex_ptr); + mutex_lock((struct mutex *)lock_mutex_ptr); + } + + if (lock_rwsem_ptr && master) { + if (verbose) + pr_notice("lock rw_semaphore %ps\n", + (void *)lock_rwsem_ptr); + if (lock_read) + down_read((struct rw_semaphore *)lock_rwsem_ptr); + else + down_write((struct rw_semaphore *)lock_rwsem_ptr); + } + + if (lock_mmap_sem && master) { + if (verbose) + pr_notice("lock mmap_sem pid=%d\n", main_task->pid); + if (lock_read) + down_read(&main_task->mm->mmap_sem); + else + down_write(&main_task->mm->mmap_sem); + } + + if (test_disable_irq) + local_irq_disable(); + + if (disable_softirq) + local_bh_disable(); + + if (disable_preempt) + preempt_disable(); + + if (lock_rcu) + rcu_read_lock(); + + if (lock_spinlock_ptr && master) { + if (verbose) + pr_notice("lock spinlock %ps\n", + (void *)lock_spinlock_ptr); + spin_lock((spinlock_t *)lock_spinlock_ptr); + } + + if (lock_rwlock_ptr && master) { + if (verbose) + pr_notice("lock rwlock %ps\n", + (void *)lock_rwlock_ptr); + if (lock_read) + read_lock((rwlock_t *)lock_rwlock_ptr); + else + write_lock((rwlock_t *)lock_rwlock_ptr); + } + + if (measure_lock_wait) { + s64 cur_wait = local_clock() - wait_start; + s64 max_wait = atomic64_read(&max_lock_wait); + + do { + if (cur_wait < max_wait) + break; + max_wait = atomic64_cmpxchg(&max_lock_wait, + max_wait, cur_wait); + } while (max_wait != cur_wait); + + if (cur_wait > lock_wait_threshold) + pr_notice_ratelimited("lock wait %lld ns\n", cur_wait); + } +} + +static void test_unlock(bool master, bool verbose) +{ + if (lock_rwlock_ptr && master) { + if (lock_read) + read_unlock((rwlock_t *)lock_rwlock_ptr); + else + write_unlock((rwlock_t *)lock_rwlock_ptr); + if (verbose) + pr_notice("unlock rwlock %ps\n", + (void *)lock_rwlock_ptr); + } + + if (lock_spinlock_ptr && master) { + spin_unlock((spinlock_t *)lock_spinlock_ptr); + if (verbose) + pr_notice("unlock spinlock %ps\n", + (void *)lock_spinlock_ptr); + } + + if (lock_rcu) + rcu_read_unlock(); + + if (disable_preempt) + preempt_enable(); + + if (disable_softirq) + local_bh_enable(); + + if (test_disable_irq) + local_irq_enable(); + + if (lock_mmap_sem && master) { + if (lock_read) + up_read(&main_task->mm->mmap_sem); + else + up_write(&main_task->mm->mmap_sem); + if (verbose) + pr_notice("unlock mmap_sem pid=%d\n", main_task->pid); + } + + if (lock_rwsem_ptr && master) { + if (lock_read) + up_read((struct rw_semaphore *)lock_rwsem_ptr); + else + up_write((struct rw_semaphore *)lock_rwsem_ptr); + if (verbose) + pr_notice("unlock rw_semaphore %ps\n", + (void *)lock_rwsem_ptr); + } + + if (lock_mutex_ptr && master) { + mutex_unlock((struct mutex *)lock_mutex_ptr); + if (verbose) + pr_notice("unlock mutex %ps\n", + (void *)lock_mutex_ptr); + } +} + +static void test_alloc_pages(struct list_head *pages) +{ + struct page *page; + unsigned int i; + + for (i = 0; i < alloc_pages_nr; i++) { + page = alloc_pages(alloc_pages_gfp, alloc_pages_order); + if (!page) { + atomic_inc(&alloc_pages_failed); + break; + } + list_add(&page->lru, pages); + } +} + +static void test_free_pages(struct list_head *pages) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, pages, lru) + __free_pages(page, alloc_pages_order); + INIT_LIST_HEAD(pages); +} + +static void test_wait(unsigned int secs, unsigned int nsecs) +{ + if (wait_state == TASK_RUNNING) { + if (secs) + mdelay(secs * MSEC_PER_SEC); + if (nsecs) + ndelay(nsecs); + return; + } + + __set_current_state(wait_state); + if (use_hrtimer) { + ktime_t time; + + time = ns_to_ktime((u64)secs * NSEC_PER_SEC + nsecs); + schedule_hrtimeout(&time, HRTIMER_MODE_REL); + } else { + schedule_timeout(secs * HZ + nsecs_to_jiffies(nsecs)); + } +} + +static void test_lockup(bool master) +{ + u64 lockup_start = local_clock(); + unsigned int iter = 0; + LIST_HEAD(pages); + + pr_notice("Start on CPU%d\n", raw_smp_processor_id()); + + test_lock(master, true); + + test_alloc_pages(&pages); + + while (iter++ < iterations && !signal_pending(main_task)) { + + if (iowait) + current->in_iowait = 1; + + test_wait(time_secs, time_nsecs); + + if (iowait) + current->in_iowait = 0; + + if (reallocate_pages) + test_free_pages(&pages); + + if (reacquire_locks) + test_unlock(master, false); + + if (touch_softlockup) + touch_softlockup_watchdog(); + + if (touch_hardlockup) + touch_nmi_watchdog(); + + if (call_cond_resched) + cond_resched(); + + test_wait(cooldown_secs, cooldown_nsecs); + + if (reacquire_locks) + test_lock(master, false); + + if (reallocate_pages) + test_alloc_pages(&pages); + } + + pr_notice("Finish on CPU%d in %lld ns\n", raw_smp_processor_id(), + local_clock() - lockup_start); + + test_free_pages(&pages); + + test_unlock(master, true); +} + +DEFINE_PER_CPU(struct work_struct, test_works); + +static void test_work_fn(struct work_struct *work) +{ + test_lockup(!lock_single || + work == per_cpu_ptr(&test_works, master_cpu)); +} + +static bool test_kernel_ptr(unsigned long addr, int size) +{ + void *ptr = (void *)addr; + char buf; + + if (!addr) + return false; + + /* should be at least readable kernel address */ + if (access_ok(ptr, 1) || + access_ok(ptr + size - 1, 1) || + probe_kernel_address(ptr, buf) || + probe_kernel_address(ptr + size - 1, buf)) { + pr_err("invalid kernel ptr: %#lx\n", addr); + return true; + } + + return false; +} + +static bool __maybe_unused test_magic(unsigned long addr, int offset, + unsigned int expected) +{ + void *ptr = (void *)addr + offset; + unsigned int magic = 0; + + if (!addr) + return false; + + if (probe_kernel_address(ptr, magic) || magic != expected) { + pr_err("invalid magic at %#lx + %#x = %#x, expected %#x\n", + addr, offset, magic, expected); + return true; + } + + return false; +} + +static int __init test_lockup_init(void) +{ + u64 test_start = local_clock(); + + main_task = current; + + switch (state[0]) { + case 'S': + wait_state = TASK_INTERRUPTIBLE; + break; + case 'D': + wait_state = TASK_UNINTERRUPTIBLE; + break; + case 'K': + wait_state = TASK_KILLABLE; + break; + case 'R': + wait_state = TASK_RUNNING; + break; + default: + pr_err("unknown state=%s\n", state); + return -EINVAL; + } + + if (alloc_pages_atomic) + alloc_pages_gfp = GFP_ATOMIC; + + if (test_kernel_ptr(lock_spinlock_ptr, sizeof(spinlock_t)) || + test_kernel_ptr(lock_rwlock_ptr, sizeof(rwlock_t)) || + test_kernel_ptr(lock_mutex_ptr, sizeof(struct mutex)) || + test_kernel_ptr(lock_rwsem_ptr, sizeof(struct rw_semaphore))) + return -EINVAL; + +#ifdef CONFIG_DEBUG_SPINLOCK + if (test_magic(lock_spinlock_ptr, + offsetof(spinlock_t, rlock.magic), + SPINLOCK_MAGIC) || + test_magic(lock_rwlock_ptr, + offsetof(rwlock_t, magic), + RWLOCK_MAGIC) || + test_magic(lock_mutex_ptr, + offsetof(struct mutex, wait_lock.rlock.magic), + SPINLOCK_MAGIC) || + test_magic(lock_rwsem_ptr, + offsetof(struct rw_semaphore, wait_lock.magic), + SPINLOCK_MAGIC)) + return -EINVAL; +#endif + + if ((wait_state != TASK_RUNNING || + (call_cond_resched && !reacquire_locks) || + (alloc_pages_nr && gfpflags_allow_blocking(alloc_pages_gfp))) && + (test_disable_irq || disable_softirq || disable_preempt || + lock_rcu || lock_spinlock_ptr || lock_rwlock_ptr)) { + pr_err("refuse to sleep in atomic context\n"); + return -EINVAL; + } + + if (lock_mmap_sem && !main_task->mm) { + pr_err("no mm to lock mmap_sem\n"); + return -EINVAL; + } + + if (test_file_path[0]) { + test_file = filp_open(test_file_path, O_RDONLY, 0); + if (IS_ERR(test_file)) { + pr_err("cannot find file_path\n"); + return -EINVAL; + } + test_inode = file_inode(test_file); + } else if (test_lock_inode || + test_lock_mapping || + test_lock_sb_umount) { + pr_err("no file to lock\n"); + return -EINVAL; + } + + if (test_lock_inode && test_inode) + lock_rwsem_ptr = (unsigned long)&test_inode->i_rwsem; + + if (test_lock_mapping && test_file && test_file->f_mapping) + lock_rwsem_ptr = (unsigned long)&test_file->f_mapping->i_mmap_rwsem; + + if (test_lock_sb_umount && test_inode) + lock_rwsem_ptr = (unsigned long)&test_inode->i_sb->s_umount; + + pr_notice("START pid=%d time=%u +%u ns cooldown=%u +%u ns iterations=%u state=%s %s%s%s%s%s%s%s%s%s%s%s\n", + main_task->pid, time_secs, time_nsecs, + cooldown_secs, cooldown_nsecs, iterations, state, + all_cpus ? "all_cpus " : "", + iowait ? "iowait " : "", + test_disable_irq ? "disable_irq " : "", + disable_softirq ? "disable_softirq " : "", + disable_preempt ? "disable_preempt " : "", + lock_rcu ? "lock_rcu " : "", + lock_read ? "lock_read " : "", + touch_softlockup ? "touch_softlockup " : "", + touch_hardlockup ? "touch_hardlockup " : "", + call_cond_resched ? "call_cond_resched " : "", + reacquire_locks ? "reacquire_locks " : ""); + + if (alloc_pages_nr) + pr_notice("ALLOCATE PAGES nr=%u order=%u gfp=%pGg %s\n", + alloc_pages_nr, alloc_pages_order, &alloc_pages_gfp, + reallocate_pages ? "reallocate_pages " : ""); + + if (all_cpus) { + unsigned int cpu; + + cpus_read_lock(); + + preempt_disable(); + master_cpu = smp_processor_id(); + for_each_online_cpu(cpu) { + INIT_WORK(per_cpu_ptr(&test_works, cpu), test_work_fn); + queue_work_on(cpu, system_highpri_wq, + per_cpu_ptr(&test_works, cpu)); + } + preempt_enable(); + + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(&test_works, cpu)); + + cpus_read_unlock(); + } else { + test_lockup(true); + } + + if (measure_lock_wait) + pr_notice("Maximum lock wait: %lld ns\n", + atomic64_read(&max_lock_wait)); + + if (alloc_pages_nr) + pr_notice("Page allocation failed %u times\n", + atomic_read(&alloc_pages_failed)); + + pr_notice("FINISH in %llu ns\n", local_clock() - test_start); + + if (test_file) + fput(test_file); + + if (signal_pending(main_task)) + return -EINTR; + + return -EAGAIN; +} +module_init(test_lockup_init); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Konstantin Khlebnikov <khlebnikov@yandex-team.ru>"); +MODULE_DESCRIPTION("Test module to generate lockups"); diff --git a/lib/test_stackinit.c b/lib/test_stackinit.c index 2d7d257a430e6d24c78f6193419f96578496126e..f93b1e145ada71a63ff417ab1076848d8a5b74b1 100644 --- a/lib/test_stackinit.c +++ b/lib/test_stackinit.c @@ -92,8 +92,9 @@ static bool range_contains(char *haystack_start, size_t haystack_size, * @var_type: type to be tested for zeroing initialization * @which: is this a SCALAR, STRING, or STRUCT type? * @init_level: what kind of initialization is performed + * @xfail: is this test expected to fail? */ -#define DEFINE_TEST_DRIVER(name, var_type, which) \ +#define DEFINE_TEST_DRIVER(name, var_type, which, xfail) \ /* Returns 0 on success, 1 on failure. */ \ static noinline __init int test_ ## name (void) \ { \ @@ -139,13 +140,14 @@ static noinline __init int test_ ## name (void) \ for (sum = 0, i = 0; i < target_size; i++) \ sum += (check_buf[i] == 0xFF); \ \ - if (sum == 0) \ + if (sum == 0) { \ pr_info(#name " ok\n"); \ - else \ - pr_warn(#name " FAIL (uninit bytes: %d)\n", \ - sum); \ - \ - return (sum != 0); \ + return 0; \ + } else { \ + pr_warn(#name " %sFAIL (uninit bytes: %d)\n", \ + (xfail) ? "X" : "", sum); \ + return (xfail) ? 0 : 1; \ + } \ } #define DEFINE_TEST(name, var_type, which, init_level) \ /* no-op to force compiler into ignoring "uninitialized" vars */\ @@ -189,7 +191,7 @@ static noinline __init int leaf_ ## name(unsigned long sp, \ \ return (int)buf[0] | (int)buf[sizeof(buf) - 1]; \ } \ -DEFINE_TEST_DRIVER(name, var_type, which) +DEFINE_TEST_DRIVER(name, var_type, which, 0) /* Structure with no padding. */ struct test_packed { @@ -326,8 +328,14 @@ static noinline __init int leaf_switch_2_none(unsigned long sp, bool fill, return __leaf_switch_none(2, fill); } -DEFINE_TEST_DRIVER(switch_1_none, uint64_t, SCALAR); -DEFINE_TEST_DRIVER(switch_2_none, uint64_t, SCALAR); +/* + * These are expected to fail for most configurations because neither + * GCC nor Clang have a way to perform initialization of variables in + * non-code areas (i.e. in a switch statement before the first "case"). + * https://bugs.llvm.org/show_bug.cgi?id=44916 + */ +DEFINE_TEST_DRIVER(switch_1_none, uint64_t, SCALAR, 1); +DEFINE_TEST_DRIVER(switch_2_none, uint64_t, SCALAR, 1); static int __init test_stackinit_init(void) { diff --git a/lib/ts_bm.c b/lib/ts_bm.c index b352903c50e3819c7a7ccc5d91d4c91c976a9f21..277cb4417ac27a77b89494803c3b1705b4e028df 100644 --- a/lib/ts_bm.c +++ b/lib/ts_bm.c @@ -52,7 +52,7 @@ struct ts_bm u8 * pattern; unsigned int patlen; unsigned int bad_shift[ASIZE]; - unsigned int good_shift[0]; + unsigned int good_shift[]; }; static unsigned int bm_find(struct ts_config *conf, struct ts_state *state) diff --git a/lib/ts_fsm.c b/lib/ts_fsm.c index 9c873cadab7c6cb946044584398c9b8aa93134eb..ab749ec10ab538550729e5cc570107b683038a23 100644 --- a/lib/ts_fsm.c +++ b/lib/ts_fsm.c @@ -32,7 +32,7 @@ struct ts_fsm { unsigned int ntokens; - struct ts_fsm_token tokens[0]; + struct ts_fsm_token tokens[]; }; /* other values derived from ctype.h */ diff --git a/lib/ts_kmp.c b/lib/ts_kmp.c index 94617e014b3a0b009d8b976e5ecdb698f9744294..c77a3d537f2436d17e6231edf08a900c7a83c00a 100644 --- a/lib/ts_kmp.c +++ b/lib/ts_kmp.c @@ -36,7 +36,7 @@ struct ts_kmp { u8 * pattern; unsigned int pattern_len; - unsigned int prefix_tbl[0]; + unsigned int prefix_tbl[]; }; static unsigned int kmp_find(struct ts_config *conf, struct ts_state *state) diff --git a/lib/ubsan.c b/lib/ubsan.c index 7b9b58aee72c9080923f309cdc47de9756ff7c37..f8c0ccf35f296e9d5323cbafbf6aefe009b9cfe6 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -45,13 +45,6 @@ static bool was_reported(struct source_location *location) return test_and_set_bit(REPORTED_BIT, &location->reported); } -static void print_source_location(const char *prefix, - struct source_location *loc) -{ - pr_err("%s %s:%d:%d\n", prefix, loc->file_name, - loc->line & LINE_MASK, loc->column & COLUMN_MASK); -} - static bool suppress_report(struct source_location *loc) { return current->in_ubsan || was_reported(loc); @@ -140,13 +133,14 @@ static void val_to_string(char *str, size_t size, struct type_descriptor *type, } } -static void ubsan_prologue(struct source_location *location) +static void ubsan_prologue(struct source_location *loc, const char *reason) { current->in_ubsan++; pr_err("========================================" "========================================\n"); - print_source_location("UBSAN: Undefined behaviour in", location); + pr_err("UBSAN: %s in %s:%d:%d\n", reason, loc->file_name, + loc->line & LINE_MASK, loc->column & COLUMN_MASK); } static void ubsan_epilogue(void) @@ -156,6 +150,17 @@ static void ubsan_epilogue(void) "========================================\n"); current->in_ubsan--; + + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. + * Resetting this prevents additional WARN() from panicking the + * system on this thread. Other threads are blocked by the + * panic_mutex in panic(). + */ + panic_on_warn = 0; + panic("panic_on_warn set ...\n"); + } } static void handle_overflow(struct overflow_data *data, void *lhs, @@ -169,12 +174,12 @@ static void handle_overflow(struct overflow_data *data, void *lhs, if (suppress_report(&data->location)) return; - ubsan_prologue(&data->location); + ubsan_prologue(&data->location, type_is_signed(type) ? + "signed-integer-overflow" : + "unsigned-integer-overflow"); val_to_string(lhs_val_str, sizeof(lhs_val_str), type, lhs); val_to_string(rhs_val_str, sizeof(rhs_val_str), type, rhs); - pr_err("%s integer overflow:\n", - type_is_signed(type) ? "signed" : "unsigned"); pr_err("%s %c %s cannot be represented in type %s\n", lhs_val_str, op, @@ -214,7 +219,7 @@ void __ubsan_handle_negate_overflow(struct overflow_data *data, if (suppress_report(&data->location)) return; - ubsan_prologue(&data->location); + ubsan_prologue(&data->location, "negation-overflow"); val_to_string(old_val_str, sizeof(old_val_str), data->type, old_val); @@ -234,7 +239,7 @@ void __ubsan_handle_divrem_overflow(struct overflow_data *data, if (suppress_report(&data->location)) return; - ubsan_prologue(&data->location); + ubsan_prologue(&data->location, "division-overflow"); val_to_string(rhs_val_str, sizeof(rhs_val_str), data->type, rhs); @@ -253,7 +258,7 @@ static void handle_null_ptr_deref(struct type_mismatch_data_common *data) if (suppress_report(data->location)) return; - ubsan_prologue(data->location); + ubsan_prologue(data->location, "null-ptr-deref"); pr_err("%s null pointer of type %s\n", type_check_kinds[data->type_check_kind], @@ -268,7 +273,7 @@ static void handle_misaligned_access(struct type_mismatch_data_common *data, if (suppress_report(data->location)) return; - ubsan_prologue(data->location); + ubsan_prologue(data->location, "misaligned-access"); pr_err("%s misaligned address %p for type %s\n", type_check_kinds[data->type_check_kind], @@ -284,7 +289,7 @@ static void handle_object_size_mismatch(struct type_mismatch_data_common *data, if (suppress_report(data->location)) return; - ubsan_prologue(data->location); + ubsan_prologue(data->location, "object-size-mismatch"); pr_err("%s address %p with insufficient space\n", type_check_kinds[data->type_check_kind], (void *) ptr); @@ -343,7 +348,7 @@ void __ubsan_handle_out_of_bounds(struct out_of_bounds_data *data, void *index) if (suppress_report(&data->location)) return; - ubsan_prologue(&data->location); + ubsan_prologue(&data->location, "array-index-out-of-bounds"); val_to_string(index_str, sizeof(index_str), data->index_type, index); pr_err("index %s is out of range for type %s\n", index_str, @@ -364,7 +369,7 @@ void __ubsan_handle_shift_out_of_bounds(struct shift_out_of_bounds_data *data, if (suppress_report(&data->location)) goto out; - ubsan_prologue(&data->location); + ubsan_prologue(&data->location, "shift-out-of-bounds"); val_to_string(rhs_str, sizeof(rhs_str), rhs_type, rhs); val_to_string(lhs_str, sizeof(lhs_str), lhs_type, lhs); @@ -396,7 +401,7 @@ EXPORT_SYMBOL(__ubsan_handle_shift_out_of_bounds); void __ubsan_handle_builtin_unreachable(struct unreachable_data *data) { - ubsan_prologue(&data->location); + ubsan_prologue(&data->location, "unreachable"); pr_err("calling __builtin_unreachable()\n"); ubsan_epilogue(); panic("can't return from __builtin_unreachable()"); @@ -411,7 +416,7 @@ void __ubsan_handle_load_invalid_value(struct invalid_value_data *data, if (suppress_report(&data->location)) return; - ubsan_prologue(&data->location); + ubsan_prologue(&data->location, "invalid-load"); val_to_string(val_str, sizeof(val_str), data->type, val); diff --git a/mm/Kconfig b/mm/Kconfig index ab80933be65ff5309de45a94a12903018ac8dcca..691021492e78c14120866f7fdd74fbbc40475038 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -236,6 +236,17 @@ config COMPACTION it and then we would be really interested to hear about that at linux-mm@kvack.org. +# +# support for free page reporting +config PAGE_REPORTING + bool "Free page reporting" + def_bool n + help + Free page reporting allows for the incremental acquisition of + free pages from the buddy allocator for the purpose of reporting + those pages to another entity, such as a hypervisor, so that the + memory can be freed within the host for other uses. + # # support for page migration # @@ -420,10 +431,6 @@ config THP_SWAP For selection by architectures with reasonable THP sizes. -config TRANSPARENT_HUGE_PAGECACHE - def_bool y - depends on TRANSPARENT_HUGEPAGE - # # UP and nommu archs use km based percpu allocator # @@ -526,7 +533,6 @@ config MEM_SOFT_DIRTY config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" depends on FRONTSWAP && CRYPTO=y - select CRYPTO_LZO select ZPOOL help A lightweight compressed cache for swap pages. It takes @@ -542,6 +548,123 @@ config ZSWAP they have not be fully explored on the large set of potential configurations and workloads that exist. +choice + prompt "Compressed cache for swap pages default compressor" + depends on ZSWAP + default ZSWAP_COMPRESSOR_DEFAULT_LZO + help + Selects the default compression algorithm for the compressed cache + for swap pages. + + For an overview what kind of performance can be expected from + a particular compression algorithm please refer to the benchmarks + available at the following LWN page: + https://lwn.net/Articles/751795/ + + If in doubt, select 'LZO'. + + The selection made here can be overridden by using the kernel + command line 'zswap.compressor=' option. + +config ZSWAP_COMPRESSOR_DEFAULT_DEFLATE + bool "Deflate" + select CRYPTO_DEFLATE + help + Use the Deflate algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_LZO + bool "LZO" + select CRYPTO_LZO + help + Use the LZO algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_842 + bool "842" + select CRYPTO_842 + help + Use the 842 algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_LZ4 + bool "LZ4" + select CRYPTO_LZ4 + help + Use the LZ4 algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_LZ4HC + bool "LZ4HC" + select CRYPTO_LZ4HC + help + Use the LZ4HC algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_ZSTD + bool "zstd" + select CRYPTO_ZSTD + help + Use the zstd algorithm as the default compression algorithm. +endchoice + +config ZSWAP_COMPRESSOR_DEFAULT + string + depends on ZSWAP + default "deflate" if ZSWAP_COMPRESSOR_DEFAULT_DEFLATE + default "lzo" if ZSWAP_COMPRESSOR_DEFAULT_LZO + default "842" if ZSWAP_COMPRESSOR_DEFAULT_842 + default "lz4" if ZSWAP_COMPRESSOR_DEFAULT_LZ4 + default "lz4hc" if ZSWAP_COMPRESSOR_DEFAULT_LZ4HC + default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD + default "" + +choice + prompt "Compressed cache for swap pages default allocator" + depends on ZSWAP + default ZSWAP_ZPOOL_DEFAULT_ZBUD + help + Selects the default allocator for the compressed cache for + swap pages. + The default is 'zbud' for compatibility, however please do + read the description of each of the allocators below before + making a right choice. + + The selection made here can be overridden by using the kernel + command line 'zswap.zpool=' option. + +config ZSWAP_ZPOOL_DEFAULT_ZBUD + bool "zbud" + select ZBUD + help + Use the zbud allocator as the default allocator. + +config ZSWAP_ZPOOL_DEFAULT_Z3FOLD + bool "z3fold" + select Z3FOLD + help + Use the z3fold allocator as the default allocator. + +config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC + bool "zsmalloc" + select ZSMALLOC + help + Use the zsmalloc allocator as the default allocator. +endchoice + +config ZSWAP_ZPOOL_DEFAULT + string + depends on ZSWAP + default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD + default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD + default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC + default "" + +config ZSWAP_DEFAULT_ON + bool "Enable the compressed cache for swap pages by default" + depends on ZSWAP + help + If selected, the compressed cache for swap pages will be enabled + at boot, otherwise it will be disabled. + + The selection made here can be overridden by using the kernel + command line 'zswap.enabled=' option. + config ZPOOL tristate "Common API for compressed memory storage" help @@ -714,7 +837,7 @@ config GUP_GET_PTE_LOW_HIGH config READ_ONLY_THP_FOR_FS bool "Read-only THP for filesystems (EXPERIMENTAL)" - depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM + depends on TRANSPARENT_HUGEPAGE && SHMEM help Allow khugepaged to put read-only file-backed pages in THP. diff --git a/mm/Makefile b/mm/Makefile index dbc8346d16ca0f0e888084f0c0ead4944ae86639..fccd3756b25f0685ed6684c43f9f47b5d94024b8 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -111,3 +111,4 @@ obj-$(CONFIG_HMM_MIRROR) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o +obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o diff --git a/mm/compaction.c b/mm/compaction.c index df3da2f76fdc8371a759178c5f3c016a5252d21b..46f0fcc93081eded97384a8c19bdc98bf98759b2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -481,6 +481,7 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page, */ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, struct compact_control *cc) + __acquires(lock) { /* Track if the lock is contended in async mode */ if (cc->mode == MIGRATE_ASYNC && !cc->contended) { @@ -989,7 +990,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Successfully isolated */ del_page_from_lru_list(page, lruvec, page_lru(page)); mod_node_page_state(page_pgdat(page), - NR_ISOLATED_ANON + page_is_file_cache(page), + NR_ISOLATED_ANON + page_is_file_lru(page), hpage_nr_pages(page)); isolate_success: diff --git a/mm/dmapool.c b/mm/dmapool.c index fe5d33060415bfcc4eb92e993b0a2b99f628a990..f9fb9bbd733e0fd20b2527999e5882e283184100 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -144,9 +144,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, else if (size < 4) size = 4; - if ((size % align) != 0) - size = ALIGN(size, align); - + size = ALIGN(size, align); allocation = max_t(size_t, size, PAGE_SIZE); if (!boundary) diff --git a/mm/filemap.c b/mm/filemap.c index 0fbdc8e30dd2a7b3278d35af49d52dd3684dbd05..23a051a7ef0fb46936d342d7b25b166473428a6e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1693,6 +1693,11 @@ EXPORT_SYMBOL(pagecache_get_page); * Any shadow entries of evicted pages, or swap entries from * shmem/tmpfs, are included in the returned array. * + * If it finds a Transparent Huge Page, head or tail, find_get_entries() + * stops at that page: the caller is likely to have a better way to handle + * the compound page as a whole, and then skip its extent, than repeatedly + * calling find_get_entries() to return all its tails. + * * Return: the number of pages and shadow entries which were found. */ unsigned find_get_entries(struct address_space *mapping, @@ -1724,8 +1729,15 @@ unsigned find_get_entries(struct address_space *mapping, /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - page = find_subpage(page, xas.xa_index); + /* + * Terminate early on finding a THP, to allow the caller to + * handle it all at once; but continue if this is hugetlbfs. + */ + if (PageTransHuge(page) && !PageHuge(page)) { + page = find_subpage(page, xas.xa_index); + nr_entries = ret + 1; + } export: indices[ret] = xas.xa_index; entries[ret] = page; diff --git a/mm/gup.c b/mm/gup.c index da3e031851443aa1f2af714c3af93906b361cb67..a21230569520913dedbae1ceec94f2df504eb9f4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -351,7 +351,8 @@ static struct page *no_page_table(struct vm_area_struct *vma, * But we can only make this optimization where a hole would surely * be zero-filled if handle_mm_fault() actually did handle it. */ - if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault)) + if ((flags & FOLL_DUMP) && + (vma_is_anonymous(vma) || !vma->vm_ops->fault)) return ERR_PTR(-EFAULT); return NULL; } @@ -1101,7 +1102,7 @@ retry: goto retry; case -EBUSY: ret = 0; - /* FALLTHRU */ + fallthrough; case -EFAULT: case -ENOMEM: case -EHWPOISON: @@ -1416,7 +1417,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * We want mlock to succeed for regions that have any permissions * other than PROT_NONE. */ - if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + if (vma_is_accessible(vma)) gup_flags |= FOLL_FORCE; /* @@ -1676,7 +1677,7 @@ check_again: list_add_tail(&head->lru, &cma_page_list); mod_node_page_state(page_pgdat(head), NR_ISOLATED_ANON + - page_is_file_cache(head), + page_is_file_lru(head), hpage_nr_pages(head)); } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0f9389f9d1f8652b5c50aa97186a36b42b88ccab..6ecd1045113b538586e87a00aec4022cf500c1f1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -326,7 +326,7 @@ static struct attribute *hugepage_attr[] = { &defrag_attr.attr, &use_zero_page_attr.attr, &hpage_pmd_size_attr.attr, -#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) +#ifdef CONFIG_SHMEM &shmem_enabled_attr.attr, #endif #ifdef CONFIG_DEBUG_VM @@ -597,6 +597,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); + count_vm_event(THP_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } @@ -1043,6 +1044,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; + /* + * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA + * does not have the VM_UFFD_WP, which means that the uffd + * fork event is not enabled. + */ + if (!(vma->vm_flags & VM_UFFD_WP)) + pmd = pmd_clear_uffd_wp(pmd); + #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (unlikely(is_swap_pmd(pmd))) { swp_entry_t entry = pmd_to_swp_entry(pmd); @@ -1446,6 +1455,7 @@ alloc: put_page(page); ret |= VM_FAULT_FALLBACK; count_vm_event(THP_FAULT_FALLBACK); + count_vm_event(THP_FAULT_FALLBACK_CHARGE); goto out; } @@ -1977,13 +1987,16 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, * - HPAGE_PMD_NR is protections changed and TLB flush necessary */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, int prot_numa) + unsigned long addr, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; pmd_t entry; bool preserve_write; int ret; + bool prot_numa = cp_flags & MM_CP_PROT_NUMA; + bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) @@ -2050,6 +2063,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_modify(entry, newprot); if (preserve_write) entry = pmd_mk_savedwrite(entry); + if (uffd_wp) { + entry = pmd_wrprotect(entry); + entry = pmd_mkuffd_wp(entry); + } else if (uffd_wp_resolve) { + /* + * Leave the write bit to be handled by PF interrupt + * handler, then things like COW could be properly + * handled. + */ + entry = pmd_clear_uffd_wp(entry); + } ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); @@ -2198,7 +2222,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t old_pmd, _pmd; - bool young, write, soft_dirty, pmd_migration = false; + bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; unsigned long addr; int i; @@ -2273,6 +2297,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, write = is_write_migration_entry(entry); young = false; soft_dirty = pmd_swp_soft_dirty(old_pmd); + uffd_wp = pmd_swp_uffd_wp(old_pmd); } else { page = pmd_page(old_pmd); if (pmd_dirty(old_pmd)) @@ -2280,6 +2305,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, write = pmd_write(old_pmd); young = pmd_young(old_pmd); soft_dirty = pmd_soft_dirty(old_pmd); + uffd_wp = pmd_uffd_wp(old_pmd); } VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); @@ -2304,6 +2330,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = swp_entry_to_pte(swp_entry); if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_swp_mkuffd_wp(entry); } else { entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); entry = maybe_mkwrite(entry, vma); @@ -2313,6 +2341,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_mkold(entry); if (soft_dirty) entry = pte_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); } pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f9ea1e5197b4fc2b25863dfb7b459514fcb137e5..f5fb53fdfa02d2da4ad71e0a21552ecab1235fe7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2010,6 +2010,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, * of size 'delta'. */ static int gather_surplus_pages(struct hstate *h, int delta) + __must_hold(&hugetlb_lock) { struct list_head surplus_list; struct page *page, *tmp; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index c2d7ae6cabd1a0413f0b0e03d35cde1c626a7fd3..aabf65d4d91bacc9987f1faed41f834368b39b40 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -467,14 +467,14 @@ static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) switch (MEMFILE_ATTR(cft->private)) { case RES_RSVD_USAGE: counter = &h_cg->rsvd_hugepage[idx]; - /* Fall through. */ + fallthrough; case RES_USAGE: val = (u64)page_counter_read(counter); seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; case RES_RSVD_LIMIT: counter = &h_cg->rsvd_hugepage[idx]; - /* Fall through. */ + fallthrough; case RES_LIMIT: val = (u64)counter->max; if (val == limit) @@ -514,7 +514,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_RSVD_LIMIT: rsvd = true; - /* Fall through. */ + fallthrough; case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); ret = page_counter_set_max( diff --git a/mm/internal.h b/mm/internal.h index 2d58ae15a9580cee1299635fe897c81f8768bf75..b5634e78f01dc6aa49b0c6926ba444b69c5fcadc 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -180,6 +180,8 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, } extern int __isolate_free_page(struct page *page, unsigned int order); +extern void __putback_isolated_page(struct page *page, unsigned int order, + int mt); extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index e61b4a49221828628e9bd1767255bf4b9ee7b8f1..2906358e42f05287731deb8f2c7e86ddbc5b4a59 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -15,7 +15,6 @@ */ #include <linux/export.h> -#include <linux/interrupt.h> #include <linux/init.h> #include <linux/kasan.h> #include <linux/kernel.h> @@ -42,28 +41,6 @@ #include "kasan.h" #include "../slab.h" -static inline int in_irqentry_text(unsigned long ptr) -{ - return (ptr >= (unsigned long)&__irqentry_text_start && - ptr < (unsigned long)&__irqentry_text_end) || - (ptr >= (unsigned long)&__softirqentry_text_start && - ptr < (unsigned long)&__softirqentry_text_end); -} - -static inline unsigned int filter_irq_stacks(unsigned long *entries, - unsigned int nr_entries) -{ - unsigned int i; - - for (i = 0; i < nr_entries; i++) { - if (in_irqentry_text(entries[i])) { - /* Include the irqentry function into the stack. */ - return i + 1; - } - } - return nr_entries; -} - static inline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[KASAN_STACK_DEPTH]; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index cf5c17d5e3613af6f6f116ca6cc7b710156996dd..80f23c9da6b08062f32f668ede36b81a50ba4c95 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -92,8 +92,16 @@ static void end_report(unsigned long *flags) pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); - if (panic_on_warn) + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. + * Resetting this prevents additional WARN() from panicking the + * system on this thread. Other threads are blocked by the + * panic_mutex in panic(). + */ + panic_on_warn = 0; panic("panic_on_warn set ...\n"); + } kasan_enable_current(); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c659c68728bc16bcd442a36629a05d548e294957..99d77ffb79c2b39b2cd1cc4dfd72c4bf1684f3d6 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -29,6 +29,7 @@ enum scan_result { SCAN_PMD_NULL, SCAN_EXCEED_NONE_PTE, SCAN_PTE_NON_PRESENT, + SCAN_PTE_UFFD_WP, SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, @@ -414,8 +415,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && (vm_flags & VM_DENYWRITE))) { - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) - return false; return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR); } @@ -513,7 +512,7 @@ void __khugepaged_exit(struct mm_struct *mm) static void release_pte_page(struct page *page) { - dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page)); unlock_page(page); putback_lru_page(page); } @@ -613,7 +612,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } inc_node_page_state(page, - NR_ISOLATED_ANON + page_is_file_cache(page)); + NR_ISOLATED_ANON + page_is_file_lru(page)); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -1139,6 +1138,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, pte_t pteval = *_pte; if (is_swap_pte(pteval)) { if (++unmapped <= khugepaged_max_ptes_swap) { + /* + * Always be strict with uffd-wp + * enabled swap entries. Please see + * comment below for pte_uffd_wp(). + */ + if (pte_swp_uffd_wp(pteval)) { + result = SCAN_PTE_UFFD_WP; + goto out_unmap; + } continue; } else { result = SCAN_EXCEED_SWAP_PTE; @@ -1158,6 +1166,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, result = SCAN_PTE_NON_PRESENT; goto out_unmap; } + if (pte_uffd_wp(pteval)) { + /* + * Don't collapse the page if any of the small + * PTEs are armed with uffd write protection. + * Here we can also mark the new huge pmd as + * write protected if any of the small ones is + * marked but that could bring uknown + * userfault messages that falls outside of + * the registered range. So, just be simple. + */ + result = SCAN_PTE_UFFD_WP; + goto out_unmap; + } if (pte_write(pteval)) writable = true; @@ -1258,7 +1279,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) } } -#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) +#ifdef CONFIG_SHMEM /* * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. @@ -1973,6 +1994,8 @@ skip: if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma)) + goto skip; while (khugepaged_scan.address < hend) { int ret; @@ -1984,14 +2007,10 @@ skip: khugepaged_scan.address + HPAGE_PMD_SIZE > hend); if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { - struct file *file; + struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); - if (shmem_file(vma->vm_file) - && !shmem_huge_enabled(vma)) - goto skip; - file = get_file(vma->vm_file); up_read(&mm->mmap_sem); ret = 1; khugepaged_scan_file(mm, file, pgoff, hpage); diff --git a/mm/ksm.c b/mm/ksm.c index d17c7d57d0d85648738ed9d38bcb38a72ceceec1..a558da9e717709e60cb31ab62a2ce9deebcb095c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -455,7 +455,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm) /* * We use break_ksm to break COW on a ksm page: it's a stripped down * - * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) + * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1) * put_page(page); * * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, @@ -2813,8 +2813,7 @@ static int ksm_memory_callback(struct notifier_block *self, */ ksm_check_stable_tree(mn->start_pfn, mn->start_pfn + mn->nr_pages); - /* fallthrough */ - + fallthrough; case MEM_CANCEL_OFFLINE: mutex_lock(&ksm_thread_mutex); ksm_run &= ~KSM_RUN_OFFLINE; diff --git a/mm/list_lru.c b/mm/list_lru.c index 8de5e3784ee4699e35007504d5f94ee91a75f2fc..4d5294c39bba7c0ecbc7f522e189dd4ce5a760f5 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -223,7 +223,7 @@ restart: switch (ret) { case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); - /* fall through */ + fallthrough; case LRU_REMOVED: isolated++; nlru->nr_items--; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ca194864d8028b835e425f32a03e156f1fe9c2a6..05b4ec2c6499daff7f658a95b6d5ab5f9d5605c7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2254,7 +2254,8 @@ static void reclaim_high(struct mem_cgroup *memcg, continue; memcg_memory_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); - } while ((memcg = parent_mem_cgroup(memcg))); + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); } static void high_work_func(struct work_struct *work) @@ -5812,7 +5813,7 @@ retry: switch (get_mctgt_type(vma, addr, ptent, &target)) { case MC_TARGET_DEVICE: device = true; - /* fall through */ + fallthrough; case MC_TARGET_PAGE: page = target.page; /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1c961cd26c0b0b5fb0dcef21ac1beff4b794bc97..a96364be8ab40066f7ee121a7895094872e28315 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1810,7 +1810,7 @@ static int __soft_offline_page(struct page *page, int flags) */ if (!__PageMovable(page)) inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + page_is_file_lru(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); diff --git a/mm/memory.c b/mm/memory.c index 586271f3efc6206f69e0aa8226ea51debd8fee5d..19874d133a66fc01d7b7d5b0e6c1c923a70d9c0d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -733,6 +733,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(*src_pte)) pte = pte_swp_mksoft_dirty(pte); + if (pte_swp_uffd_wp(*src_pte)) + pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_private_entry(entry)) { @@ -762,6 +764,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, is_cow_mapping(vm_flags)) { make_device_private_entry_read(&entry); pte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(*src_pte)) + pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } @@ -785,6 +789,14 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_mkclean(pte); pte = pte_mkold(pte); + /* + * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA + * does not have the VM_UFFD_WP, which means that the uffd + * fork event is not enabled. + */ + if (!(vm_flags & VM_UFFD_WP)) + pte = pte_clear_uffd_wp(pte); + page = vm_normal_page(vma, addr, pte); if (page) { get_page(page); @@ -1940,7 +1952,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, * @vma: user vma to map to * @addr: target user address to start at * @pfn: page frame number of kernel physical memory address - * @size: size of map area + * @size: size of mapping area * @prot: page protection flags for this mapping * * Note: this is only safe if the mm semaphore is held when called. @@ -2752,6 +2764,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + if (userfaultfd_pte_wp(vma, *vmf->pte)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_WP); + } + vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); if (!vmf->page) { /* @@ -3085,6 +3102,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); + if (pte_swp_uffd_wp(vmf->orig_pte)) { + pte = pte_mkuffd_wp(pte); + pte = pte_wrprotect(pte); + } set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; @@ -3373,7 +3394,7 @@ map_pte: return 0; } -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE static void deposit_prealloc_pte(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -3475,8 +3496,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, pte_t entry; vm_fault_t ret; - if (pmd_none(*vmf->pmd) && PageTransCompound(page) && - IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { + if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { /* THP on COW? */ VM_BUG_ON_PAGE(memcg, page); @@ -3949,8 +3969,11 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) /* `inline' is required to avoid gcc 4.1.2 build error */ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { - if (vma_is_anonymous(vmf->vma)) + if (vma_is_anonymous(vmf->vma)) { + if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd)) + return handle_userfault(vmf, VM_UFFD_WP); return do_huge_pmd_wp_page(vmf, orig_pmd); + } if (vmf->vma->vm_ops->huge_fault) { vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); @@ -3964,11 +3987,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) return VM_FAULT_FALLBACK; } -static inline bool vma_is_accessible(struct vm_area_struct *vma) -{ - return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); -} - static vm_fault_t create_huge_pud(struct vm_fault *vmf) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 19389cdc16a502f04726f4e12bab64cb8e4e1879..635e8e2865988ea384ce5d8141c593af8666bdf0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -67,18 +67,17 @@ void put_online_mems(void) bool movable_node_enabled = false; #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE -bool memhp_auto_online; +int memhp_default_online_type = MMOP_OFFLINE; #else -bool memhp_auto_online = true; +int memhp_default_online_type = MMOP_ONLINE; #endif -EXPORT_SYMBOL_GPL(memhp_auto_online); static int __init setup_memhp_default_state(char *str) { - if (!strcmp(str, "online")) - memhp_auto_online = true; - else if (!strcmp(str, "offline")) - memhp_auto_online = false; + const int online_type = memhp_online_type_from_str(str); + + if (online_type >= 0) + memhp_default_online_type = online_type; return 1; } @@ -105,7 +104,13 @@ static struct resource *register_memory_resource(u64 start, u64 size) unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; char *resource_name = "System RAM"; - if (start + size > max_mem_size) + /* + * Make sure value parsed from 'mem=' only restricts memory adding + * while booting, so that memory hotplug won't be impacted. Please + * refer to document of 'mem=' in kernel-parameters.txt for more + * details. + */ + if (start + size > max_mem_size && system_state < SYSTEM_RUNNING) return ERR_PTR(-E2BIG); /* @@ -301,8 +306,9 @@ static int check_hotplug_memory_addressable(unsigned long pfn, int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, struct mhp_restrictions *restrictions) { + const unsigned long end_pfn = pfn + nr_pages; + unsigned long cur_nr_pages; int err; - unsigned long nr, start_sec, end_sec; struct vmem_altmap *altmap = restrictions->altmap; err = check_hotplug_memory_addressable(pfn, nr_pages); @@ -325,18 +331,13 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, if (err) return err; - start_sec = pfn_to_section_nr(pfn); - end_sec = pfn_to_section_nr(pfn + nr_pages - 1); - for (nr = start_sec; nr <= end_sec; nr++) { - unsigned long pfns; - - pfns = min(nr_pages, PAGES_PER_SECTION - - (pfn & ~PAGE_SECTION_MASK)); - err = sparse_add_section(nid, pfn, pfns, altmap); + for (; pfn < end_pfn; pfn += cur_nr_pages) { + /* Select all remaining pages up to the next section boundary */ + cur_nr_pages = min(end_pfn - pfn, + SECTION_ALIGN_UP(pfn + 1) - pfn); + err = sparse_add_section(nid, pfn, cur_nr_pages, altmap); if (err) break; - pfn += pfns; - nr_pages -= pfns; cond_resched(); } vmemmap_populate_print_last(); @@ -494,7 +495,7 @@ static void __remove_section(unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap) { - struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn)); + struct mem_section *ms = __pfn_to_section(pfn); if (WARN_ON_ONCE(!valid_section(ms))) return; @@ -528,7 +529,8 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages, for (; pfn < end_pfn; pfn += cur_nr_pages) { cond_resched(); /* Select all remaining pages up to the next section boundary */ - cur_nr_pages = min(end_pfn - pfn, -(pfn | PAGE_SECTION_MASK)); + cur_nr_pages = min(end_pfn - pfn, + SECTION_ALIGN_UP(pfn + 1) - pfn); __remove_section(pfn, cur_nr_pages, map_offset, altmap); map_offset = 0; } @@ -988,6 +990,7 @@ static int check_hotplug_memory_range(u64 start, u64 size) static int online_memory_block(struct memory_block *mem, void *arg) { + mem->online_type = memhp_default_online_type; return device_online(&mem->dev); } @@ -1060,7 +1063,7 @@ int __ref add_memory_resource(int nid, struct resource *res) mem_hotplug_done(); /* online pages if requested */ - if (memhp_auto_online) + if (memhp_default_online_type != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; @@ -1317,7 +1320,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) list_add_tail(&page->lru, &source); if (!__PageMovable(page)) inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + page_is_file_lru(page)); } else { pr_warn("failed to isolate pfn %lx\n", pfn); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5fb427aed61279d066e603fb656b1b7c10e7c30f..b4a039d779b0a4adafe30764e775223e2ce0ca56 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -442,6 +442,7 @@ static inline bool queue_pages_required(struct page *page, */ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long end, struct mm_walk *walk) + __releases(ptl) { int ret = 0; struct page *page; @@ -627,7 +628,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, { int nr_updated; - nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1); + nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); @@ -678,8 +679,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ - if (!is_vm_hugetlb_page(vma) && - (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) && + if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) && !(vma->vm_flags & VM_MIXEDMAP)) change_prot_numa(vma, start, endvma); return 1; @@ -881,7 +881,6 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) switch (p->mode) { case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: *nodes = p->v.nodes; break; @@ -1023,7 +1022,7 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist, if (!isolate_lru_page(head)) { list_add_tail(&head->lru, pagelist); mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_cache(head), + NR_ISOLATED_ANON + page_is_file_lru(head), hpage_nr_pages(head)); } else if (flags & MPOL_MF_STRICT) { /* @@ -2066,7 +2065,6 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) break; case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: *mask = mempolicy->v.nodes; break; @@ -2333,7 +2331,6 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) switch (a->mode) { case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: diff --git a/mm/migrate.c b/mm/migrate.c index 7ded07081be9a22153ba218f593a9504e1c092bd..7160c1556f797fa961b77065e47e57e628ebf3de 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -193,7 +193,7 @@ void putback_movable_pages(struct list_head *l) put_page(page); } else { mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_cache(page), -hpage_nr_pages(page)); + page_is_file_lru(page), -hpage_nr_pages(page)); putback_lru_page(page); } } @@ -243,11 +243,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, entry = pte_to_swp_entry(*pvmw.pte); if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); + else if (pte_swp_uffd_wp(*pvmw.pte)) + pte = pte_mkuffd_wp(pte); if (unlikely(is_zone_device_page(new))) { if (is_device_private_page(new)) { entry = make_device_private_entry(new, pte_write(pte)); pte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(*pvmw.pte)) + pte = pte_mkuffd_wp(pte); } } @@ -647,6 +651,14 @@ void migrate_page_states(struct page *newpage, struct page *page) if (PageWriteback(newpage)) end_page_writeback(newpage); + /* + * PG_readahead shares the same bit with PG_reclaim. The above + * end_page_writeback() may clear PG_readahead mistakenly, so set the + * bit after that. + */ + if (PageReadahead(page)) + SetPageReadahead(newpage); + copy_page_owner(page, newpage); mem_cgroup_migrate(page, newpage); @@ -1211,7 +1223,7 @@ out: */ if (likely(!__PageMovable(page))) mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_cache(page), -hpage_nr_pages(page)); + page_is_file_lru(page), -hpage_nr_pages(page)); } /* @@ -1518,9 +1530,6 @@ static int do_move_pages_to_node(struct mm_struct *mm, { int err; - if (list_empty(pagelist)) - return 0; - err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, MIGRATE_SYNC, MR_SYSCALL); if (err) @@ -1587,7 +1596,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, err = 1; list_add_tail(&head->lru, pagelist); mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_cache(head), + NR_ISOLATED_ANON + page_is_file_lru(head), hpage_nr_pages(head)); } out_putpage: @@ -1602,6 +1611,32 @@ out: return err; } +static int move_pages_and_store_status(struct mm_struct *mm, int node, + struct list_head *pagelist, int __user *status, + int start, int i, unsigned long nr_pages) +{ + int err; + + if (list_empty(pagelist)) + return 0; + + err = do_move_pages_to_node(mm, pagelist, node); + if (err) { + /* + * Positive err means the number of failed + * pages to migrate. Since we are going to + * abort and return the number of non-migrated + * pages, so need to incude the rest of the + * nr_pages that have not been attempted as + * well. + */ + if (err > 0) + err += nr_pages - i - 1; + return err; + } + return store_status(status, start, node, i - start); +} + /* * Migrate an array of page address onto an array of nodes and fill * the corresponding array of status. @@ -1645,21 +1680,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, current_node = node; start = i; } else if (node != current_node) { - err = do_move_pages_to_node(mm, &pagelist, current_node); - if (err) { - /* - * Positive err means the number of failed - * pages to migrate. Since we are going to - * abort and return the number of non-migrated - * pages, so need to incude the rest of the - * nr_pages that have not been attempted as - * well. - */ - if (err > 0) - err += nr_pages - i - 1; - goto out; - } - err = store_status(status, start, current_node, i - start); + err = move_pages_and_store_status(mm, current_node, + &pagelist, status, start, i, nr_pages); if (err) goto out; start = i; @@ -1673,49 +1695,29 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, err = add_page_for_migration(mm, addr, current_node, &pagelist, flags & MPOL_MF_MOVE_ALL); - if (!err) { - /* The page is already on the target node */ - err = store_status(status, i, current_node, 1); - if (err) - goto out_flush; - continue; - } else if (err > 0) { + if (err > 0) { /* The page is successfully queued for migration */ continue; } - err = store_status(status, i, err, 1); + /* + * If the page is already on the target node (!err), store the + * node, otherwise, store the err. + */ + err = store_status(status, i, err ? : current_node, 1); if (err) goto out_flush; - err = do_move_pages_to_node(mm, &pagelist, current_node); - if (err) { - if (err > 0) - err += nr_pages - i - 1; + err = move_pages_and_store_status(mm, current_node, &pagelist, + status, start, i, nr_pages); + if (err) goto out; - } - if (i > start) { - err = store_status(status, start, current_node, i - start); - if (err) - goto out; - } current_node = NUMA_NO_NODE; } out_flush: - if (list_empty(&pagelist)) - return err; - /* Make sure we do not overwrite the existing error */ - err1 = do_move_pages_to_node(mm, &pagelist, current_node); - /* - * Don't have to report non-attempted pages here since: - * - If the above loop is done gracefully all pages have been - * attempted. - * - If the above loop is aborted it means a fatal error - * happened, should return ret. - */ - if (!err1) - err1 = store_status(status, start, current_node, i - start); + err1 = move_pages_and_store_status(mm, current_node, &pagelist, + status, start, i, nr_pages); if (err >= 0) err = err1; out: @@ -1957,7 +1959,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) return 0; } - page_lru = page_is_file_cache(page); + page_lru = page_is_file_lru(page); mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, hpage_nr_pages(page)); @@ -1993,7 +1995,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, * Don't migrate file pages that are mapped in multiple processes * with execute permissions as they are probably shared libraries. */ - if (page_mapcount(page) != 1 && page_is_file_cache(page) && + if (page_mapcount(page) != 1 && page_is_file_lru(page) && (vma->vm_flags & VM_EXEC)) goto out; @@ -2001,7 +2003,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, * Also do not migrate dirty pages as not all filesystems can move * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. */ - if (page_is_file_cache(page) && PageDirty(page)) + if (page_is_file_lru(page) && PageDirty(page)) goto out; isolated = numamigrate_isolate_page(pgdat, page); @@ -2016,7 +2018,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, if (!list_empty(&migratepages)) { list_del(&page->lru); dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + page_is_file_lru(page)); putback_lru_page(page); } isolated = 0; @@ -2046,7 +2048,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; struct page *new_page = NULL; - int page_lru = page_is_file_cache(page); + int page_lru = page_is_file_lru(page); unsigned long start = address & HPAGE_PMD_MASK; new_page = alloc_pages_node(node, @@ -2340,6 +2342,8 @@ again: swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pte)) swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, addr, ptep, swp_pte); /* diff --git a/mm/mm_init.c b/mm/mm_init.c index 5c918388de993f18bb53e51af3f732b695d7e71d..7da6991d94359cebc28a55949b69802593451495 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -37,7 +37,7 @@ void __init mminit_verify_zonelist(void) struct zonelist *zonelist; int i, listid, zoneid; - BUG_ON(MAX_ZONELISTS > 2); + BUILD_BUG_ON(MAX_ZONELISTS > 2); for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) { /* Identify the zone and nodelist */ diff --git a/mm/mmap.c b/mm/mmap.c index 94ae18398c59d4b34011a38f2b905935ceb36ec9..8d77dbbb80fe724603e3aaf94d5857195aabb5d4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1460,7 +1460,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * with MAP_SHARED to preserve backward compatibility. */ flags &= LEGACY_MAP_MASK; - /* fall through */ + fallthrough; case MAP_SHARED_VALIDATE: if (flags & ~flags_mask) return -EOPNOTSUPP; @@ -1487,8 +1487,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); - - /* fall through */ + fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; @@ -2358,8 +2357,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) gap_addr = TASK_SIZE; next = vma->vm_next; - if (next && next->vm_start < gap_addr && - (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { + if (next && next->vm_start < gap_addr && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ @@ -2440,7 +2438,7 @@ int expand_downwards(struct vm_area_struct *vma, prev = vma->vm_prev; /* Check that both stack segments have the same anon_vma? */ if (prev && !(prev->vm_flags & VM_GROWSDOWN) && - (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { + vma_is_accessible(prev)) { if (address - prev->vm_end < stack_guard_gap) return -ENOMEM; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 311c0dadf71c9f5eb51b4d2004028fe0fe9cc4d8..1d823b0503299b79ea5ac602796994c335bb5520 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -37,12 +37,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + unsigned long cp_flags) { pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; int target_node = NUMA_NO_NODE; + bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT; + bool prot_numa = cp_flags & MM_CP_PROT_NUMA; + bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; /* * Can be called with only the mmap_sem for reading by @@ -98,7 +102,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * it cannot move them all from MIGRATE_ASYNC * context. */ - if (page_is_file_cache(page) && PageDirty(page)) + if (page_is_file_lru(page) && PageDirty(page)) continue; /* @@ -114,6 +118,19 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (preserve_write) ptent = pte_mk_savedwrite(ptent); + if (uffd_wp) { + ptent = pte_wrprotect(ptent); + ptent = pte_mkuffd_wp(ptent); + } else if (uffd_wp_resolve) { + /* + * Leave the write bit to be handled + * by PF interrupt handler, then + * things like COW could be properly + * handled. + */ + ptent = pte_clear_uffd_wp(ptent); + } + /* Avoid taking write faults for known dirty pages */ if (dirty_accountable && pte_dirty(ptent) && (pte_soft_dirty(ptent) || @@ -122,11 +139,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); pages++; - } else if (IS_ENABLED(CONFIG_MIGRATION)) { + } else if (is_swap_pte(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); + pte_t newpte; if (is_write_migration_entry(entry)) { - pte_t newpte; /* * A protection check is difficult so * just be safe and disable write @@ -135,22 +152,28 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); - set_pte_at(vma->vm_mm, addr, pte, newpte); - - pages++; - } - - if (is_write_device_private_entry(entry)) { - pte_t newpte; - + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else if (is_write_device_private_entry(entry)) { /* * We do not preserve soft-dirtiness. See * copy_one_pte() for explanation. */ make_device_private_entry_read(&entry); newpte = swp_entry_to_pte(entry); - set_pte_at(vma->vm_mm, addr, pte, newpte); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else { + newpte = oldpte; + } + + if (uffd_wp) + newpte = pte_swp_mkuffd_wp(newpte); + else if (uffd_wp_resolve) + newpte = pte_swp_clear_uffd_wp(newpte); + if (!pte_same(oldpte, newpte)) { + set_pte_at(vma->vm_mm, addr, pte, newpte); pages++; } } @@ -188,7 +211,7 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - pgprot_t newprot, int dirty_accountable, int prot_numa) + pgprot_t newprot, unsigned long cp_flags) { pmd_t *pmd; unsigned long next; @@ -229,7 +252,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, __split_huge_pmd(vma, pmd, addr, false, NULL); } else { int nr_ptes = change_huge_pmd(vma, pmd, addr, - newprot, prot_numa); + newprot, cp_flags); if (nr_ptes) { if (nr_ptes == HPAGE_PMD_NR) { @@ -244,7 +267,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, /* fall through, the trans huge pmd just split */ } this_pages = change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); pages += this_pages; next: cond_resched(); @@ -260,7 +283,7 @@ next: static inline unsigned long change_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, - pgprot_t newprot, int dirty_accountable, int prot_numa) + pgprot_t newprot, unsigned long cp_flags) { pud_t *pud; unsigned long next; @@ -272,7 +295,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, if (pud_none_or_clear_bad(pud)) continue; pages += change_pmd_range(vma, pud, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); } while (pud++, addr = next, addr != end); return pages; @@ -280,7 +303,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, static inline unsigned long change_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - pgprot_t newprot, int dirty_accountable, int prot_numa) + pgprot_t newprot, unsigned long cp_flags) { p4d_t *p4d; unsigned long next; @@ -292,7 +315,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma, if (p4d_none_or_clear_bad(p4d)) continue; pages += change_pud_range(vma, p4d, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); } while (p4d++, addr = next, addr != end); return pages; @@ -300,7 +323,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma, static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -317,7 +340,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, if (pgd_none_or_clear_bad(pgd)) continue; pages += change_p4d_range(vma, pgd, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); } while (pgd++, addr = next, addr != end); /* Only flush the TLB if we actually modified any entries: */ @@ -330,14 +353,17 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + unsigned long cp_flags) { unsigned long pages; + BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); + if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else - pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); + pages = change_protection_range(vma, start, end, newprot, + cp_flags); return pages; } @@ -459,7 +485,7 @@ success: vma_set_page_prot(vma); change_protection(vma, start, end, vma->vm_page_prot, - dirty_accountable, 0); + dirty_accountable ? MM_CP_DIRTY_ACCT : 0); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e5f76da8cd4e753fac9e466a1bd9531b2e0dabca..114c56c3685d9f7bf24d9127355c6821cd64d321 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -74,6 +74,7 @@ #include <asm/div64.h> #include "internal.h" #include "shuffle.h" +#include "page_reporting.h" /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); @@ -864,6 +865,78 @@ compaction_capture(struct capture_control *capc, struct page *page, } #endif /* CONFIG_COMPACTION */ +/* Used for pages not on another list */ +static inline void add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_add(&page->lru, &area->free_list[migratetype]); + area->nr_free++; +} + +/* Used for pages not on another list */ +static inline void add_to_free_list_tail(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_add_tail(&page->lru, &area->free_list[migratetype]); + area->nr_free++; +} + +/* Used for pages which are on another list */ +static inline void move_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_move(&page->lru, &area->free_list[migratetype]); +} + +static inline void del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order) +{ + /* clear reported state and update reported page count */ + if (page_reported(page)) + __ClearPageReported(page); + + list_del(&page->lru); + __ClearPageBuddy(page); + set_page_private(page, 0); + zone->free_area[order].nr_free--; +} + +/* + * If this is not the largest possible page, check if the buddy + * of the next-highest order is free. If it is, it's possible + * that pages are being freed that will coalesce soon. In case, + * that is happening, add the free page to the tail of the list + * so it's less likely to be used soon and more likely to be merged + * as a higher order page + */ +static inline bool +buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, + struct page *page, unsigned int order) +{ + struct page *higher_page, *higher_buddy; + unsigned long combined_pfn; + + if (order >= MAX_ORDER - 2) + return false; + + if (!pfn_valid_within(buddy_pfn)) + return false; + + combined_pfn = buddy_pfn & pfn; + higher_page = page + (combined_pfn - pfn); + buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); + higher_buddy = higher_page + (buddy_pfn - combined_pfn); + + return pfn_valid_within(buddy_pfn) && + page_is_buddy(higher_page, higher_buddy, order + 1); +} + /* * Freeing function for a buddy system allocator. * @@ -891,13 +964,14 @@ compaction_capture(struct capture_control *capc, struct page *page, static inline void __free_one_page(struct page *page, unsigned long pfn, struct zone *zone, unsigned int order, - int migratetype) + int migratetype, bool report) { - unsigned long combined_pfn; + struct capture_control *capc = task_capc(zone); unsigned long uninitialized_var(buddy_pfn); - struct page *buddy; + unsigned long combined_pfn; unsigned int max_order; - struct capture_control *capc = task_capc(zone); + struct page *buddy; + bool to_tail; max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); @@ -932,7 +1006,7 @@ continue_merging: if (page_is_guard(buddy)) clear_page_guard(zone, buddy, order, migratetype); else - del_page_from_free_area(buddy, &zone->free_area[order]); + del_page_from_free_list(buddy, zone, order); combined_pfn = buddy_pfn & pfn; page = page + (combined_pfn - pfn); pfn = combined_pfn; @@ -966,35 +1040,19 @@ continue_merging: done_merging: set_page_order(page, order); - /* - * If this is not the largest possible page, check if the buddy - * of the next-highest order is free. If it is, it's possible - * that pages are being freed that will coalesce soon. In case, - * that is happening, add the free page to the tail of the list - * so it's less likely to be used soon and more likely to be merged - * as a higher order page - */ - if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn) - && !is_shuffle_order(order)) { - struct page *higher_page, *higher_buddy; - combined_pfn = buddy_pfn & pfn; - higher_page = page + (combined_pfn - pfn); - buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); - higher_buddy = higher_page + (buddy_pfn - combined_pfn); - if (pfn_valid_within(buddy_pfn) && - page_is_buddy(higher_page, higher_buddy, order + 1)) { - add_to_free_area_tail(page, &zone->free_area[order], - migratetype); - return; - } - } - if (is_shuffle_order(order)) - add_to_free_area_random(page, &zone->free_area[order], - migratetype); + to_tail = shuffle_pick_tail(); + else + to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); + + if (to_tail) + add_to_free_list_tail(page, zone, order, migratetype); else - add_to_free_area(page, &zone->free_area[order], migratetype); + add_to_free_list(page, zone, order, migratetype); + /* Notify page reporting subsystem of freed page */ + if (report) + page_reporting_notify_free(order); } /* @@ -1311,7 +1369,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); - __free_one_page(page, page_to_pfn(page), zone, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, 0, mt, true); trace_mm_page_pcpu_drain(page, 0, mt); } spin_unlock(&zone->lock); @@ -1327,7 +1385,7 @@ static void free_one_page(struct zone *zone, is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } - __free_one_page(page, pfn, zone, order, migratetype); + __free_one_page(page, pfn, zone, order, migratetype, true); spin_unlock(&zone->lock); } @@ -2008,13 +2066,11 @@ void __init init_cma_reserved_pageblock(struct page *page) * -- nyc */ static inline void expand(struct zone *zone, struct page *page, - int low, int high, struct free_area *area, - int migratetype) + int low, int high, int migratetype) { unsigned long size = 1 << high; while (high > low) { - area--; high--; size >>= 1; VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); @@ -2028,7 +2084,7 @@ static inline void expand(struct zone *zone, struct page *page, if (set_page_guard(zone, &page[size], high, migratetype)) continue; - add_to_free_area(&page[size], area, migratetype); + add_to_free_list(&page[size], zone, high, migratetype); set_page_order(&page[size], high); } } @@ -2186,8 +2242,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, page = get_page_from_free_area(area, migratetype); if (!page) continue; - del_page_from_free_area(page, area); - expand(zone, page, order, current_order, area, migratetype); + del_page_from_free_list(page, zone, current_order); + expand(zone, page, order, current_order, migratetype); set_pcppage_migratetype(page, migratetype); return page; } @@ -2261,7 +2317,7 @@ static int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_zone(page) != zone, page); order = page_order(page); - move_to_free_area(page, &zone->free_area[order], migratetype); + move_to_free_list(page, zone, order, migratetype); page += 1 << order; pages_moved += 1 << order; } @@ -2377,7 +2433,6 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, unsigned int alloc_flags, int start_type, bool whole_block) { unsigned int current_order = page_order(page); - struct free_area *area; int free_pages, movable_pages, alike_pages; int old_block_type; @@ -2448,8 +2503,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, return; single_page: - area = &zone->free_area[current_order]; - move_to_free_area(page, area, start_type); + move_to_free_list(page, zone, current_order, start_type); } /* @@ -3120,7 +3174,6 @@ EXPORT_SYMBOL_GPL(split_page); int __isolate_free_page(struct page *page, unsigned int order) { - struct free_area *area = &page_zone(page)->free_area[order]; unsigned long watermark; struct zone *zone; int mt; @@ -3146,7 +3199,7 @@ int __isolate_free_page(struct page *page, unsigned int order) /* Remove page from free list */ - del_page_from_free_area(page, area); + del_page_from_free_list(page, zone, order); /* * Set the pageblock if the isolated page is at least half of a @@ -3167,6 +3220,25 @@ int __isolate_free_page(struct page *page, unsigned int order) return 1UL << order; } +/** + * __putback_isolated_page - Return a now-isolated page back where we got it + * @page: Page that was isolated + * @order: Order of the isolated page + * + * This function is meant to return a page pulled from the free lists via + * __isolate_free_page back to the free lists they were pulled from. + */ +void __putback_isolated_page(struct page *page, unsigned int order, int mt) +{ + struct zone *zone = page_zone(page); + + /* zone lock should be held when this function is called */ + lockdep_assert_held(&zone->lock); + + /* Return isolated page to tail of freelist. */ + __free_one_page(page, page_to_pfn(page), zone, order, mt, false); +} + /* * Update NUMA hit/miss statistics * @@ -8713,7 +8785,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(!PageBuddy(page)); order = page_order(page); offlined_pages += 1 << order; - del_page_from_free_area(page, &zone->free_area[order]); + del_page_from_free_list(page, zone, order); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); diff --git a/mm/page_ext.c b/mm/page_ext.c index 08ded037f89f655d0997214ec665c3ea4c48ae95..a3616f7a0e9e923c2add3a8459c8b2a82da524f7 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -303,11 +303,8 @@ static int __meminit online_page_ext(unsigned long start_pfn, VM_BUG_ON(!node_state(nid, N_ONLINE)); } - for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { - if (!pfn_in_present_section(pfn)) - continue; + for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) fail = init_section_page_ext(pfn, nid); - } if (!fail) return 0; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index a9fd7c740c23894bc94e57fc380778880f923722..2c11a38d6e8754c4a3c8cc7b58f415ac453e8a92 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -117,13 +117,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) __mod_zone_freepage_state(zone, nr_pages, migratetype); } set_pageblock_migratetype(page, migratetype); + if (isolated_page) + __putback_isolated_page(page, order, migratetype); zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); - if (isolated_page) { - post_alloc_hook(page, order, __GFP_MOVABLE); - __free_pages(page, order); - } } static inline struct page * diff --git a/mm/page_reporting.c b/mm/page_reporting.c new file mode 100644 index 0000000000000000000000000000000000000000..3bbd471cfc81438a5f2935378be35c5860574bfe --- /dev/null +++ b/mm/page_reporting.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/page_reporting.h> +#include <linux/gfp.h> +#include <linux/export.h> +#include <linux/delay.h> +#include <linux/scatterlist.h> + +#include "page_reporting.h" +#include "internal.h" + +#define PAGE_REPORTING_DELAY (2 * HZ) +static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly; + +enum { + PAGE_REPORTING_IDLE = 0, + PAGE_REPORTING_REQUESTED, + PAGE_REPORTING_ACTIVE +}; + +/* request page reporting */ +static void +__page_reporting_request(struct page_reporting_dev_info *prdev) +{ + unsigned int state; + + /* Check to see if we are in desired state */ + state = atomic_read(&prdev->state); + if (state == PAGE_REPORTING_REQUESTED) + return; + + /* + * If reporting is already active there is nothing we need to do. + * Test against 0 as that represents PAGE_REPORTING_IDLE. + */ + state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED); + if (state != PAGE_REPORTING_IDLE) + return; + + /* + * Delay the start of work to allow a sizable queue to build. For + * now we are limiting this to running no more than once every + * couple of seconds. + */ + schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY); +} + +/* notify prdev of free page reporting request */ +void __page_reporting_notify(void) +{ + struct page_reporting_dev_info *prdev; + + /* + * We use RCU to protect the pr_dev_info pointer. In almost all + * cases this should be present, however in the unlikely case of + * a shutdown this will be NULL and we should exit. + */ + rcu_read_lock(); + prdev = rcu_dereference(pr_dev_info); + if (likely(prdev)) + __page_reporting_request(prdev); + + rcu_read_unlock(); +} + +static void +page_reporting_drain(struct page_reporting_dev_info *prdev, + struct scatterlist *sgl, unsigned int nents, bool reported) +{ + struct scatterlist *sg = sgl; + + /* + * Drain the now reported pages back into their respective + * free lists/areas. We assume at least one page is populated. + */ + do { + struct page *page = sg_page(sg); + int mt = get_pageblock_migratetype(page); + unsigned int order = get_order(sg->length); + + __putback_isolated_page(page, order, mt); + + /* If the pages were not reported due to error skip flagging */ + if (!reported) + continue; + + /* + * If page was not comingled with another page we can + * consider the result to be "reported" since the page + * hasn't been modified, otherwise we will need to + * report on the new larger page when we make our way + * up to that higher order. + */ + if (PageBuddy(page) && page_order(page) == order) + __SetPageReported(page); + } while ((sg = sg_next(sg))); + + /* reinitialize scatterlist now that it is empty */ + sg_init_table(sgl, nents); +} + +/* + * The page reporting cycle consists of 4 stages, fill, report, drain, and + * idle. We will cycle through the first 3 stages until we cannot obtain a + * full scatterlist of pages, in that case we will switch to idle. + */ +static int +page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, + unsigned int order, unsigned int mt, + struct scatterlist *sgl, unsigned int *offset) +{ + struct free_area *area = &zone->free_area[order]; + struct list_head *list = &area->free_list[mt]; + unsigned int page_len = PAGE_SIZE << order; + struct page *page, *next; + long budget; + int err = 0; + + /* + * Perform early check, if free area is empty there is + * nothing to process so we can skip this free_list. + */ + if (list_empty(list)) + return err; + + spin_lock_irq(&zone->lock); + + /* + * Limit how many calls we will be making to the page reporting + * device for this list. By doing this we avoid processing any + * given list for too long. + * + * The current value used allows us enough calls to process over a + * sixteenth of the current list plus one additional call to handle + * any pages that may have already been present from the previous + * list processed. This should result in us reporting all pages on + * an idle system in about 30 seconds. + * + * The division here should be cheap since PAGE_REPORTING_CAPACITY + * should always be a power of 2. + */ + budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16); + + /* loop through free list adding unreported pages to sg list */ + list_for_each_entry_safe(page, next, list, lru) { + /* We are going to skip over the reported pages. */ + if (PageReported(page)) + continue; + + /* + * If we fully consumed our budget then update our + * state to indicate that we are requesting additional + * processing and exit this list. + */ + if (budget < 0) { + atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED); + next = page; + break; + } + + /* Attempt to pull page from list and place in scatterlist */ + if (*offset) { + if (!__isolate_free_page(page, order)) { + next = page; + break; + } + + /* Add page to scatter list */ + --(*offset); + sg_set_page(&sgl[*offset], page, page_len, 0); + + continue; + } + + /* + * Make the first non-reported page in the free list + * the new head of the free list before we release the + * zone lock. + */ + if (&page->lru != list && !list_is_first(&page->lru, list)) + list_rotate_to_front(&page->lru, list); + + /* release lock before waiting on report processing */ + spin_unlock_irq(&zone->lock); + + /* begin processing pages in local list */ + err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY); + + /* reset offset since the full list was reported */ + *offset = PAGE_REPORTING_CAPACITY; + + /* update budget to reflect call to report function */ + budget--; + + /* reacquire zone lock and resume processing */ + spin_lock_irq(&zone->lock); + + /* flush reported pages from the sg list */ + page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err); + + /* + * Reset next to first entry, the old next isn't valid + * since we dropped the lock to report the pages + */ + next = list_first_entry(list, struct page, lru); + + /* exit on error */ + if (err) + break; + } + + /* Rotate any leftover pages to the head of the freelist */ + if (&next->lru != list && !list_is_first(&next->lru, list)) + list_rotate_to_front(&next->lru, list); + + spin_unlock_irq(&zone->lock); + + return err; +} + +static int +page_reporting_process_zone(struct page_reporting_dev_info *prdev, + struct scatterlist *sgl, struct zone *zone) +{ + unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY; + unsigned long watermark; + int err = 0; + + /* Generate minimum watermark to be able to guarantee progress */ + watermark = low_wmark_pages(zone) + + (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER); + + /* + * Cancel request if insufficient free memory or if we failed + * to allocate page reporting statistics for the zone. + */ + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + return err; + + /* Process each free list starting from lowest order/mt */ + for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) { + for (mt = 0; mt < MIGRATE_TYPES; mt++) { + /* We do not pull pages from the isolate free list */ + if (is_migrate_isolate(mt)) + continue; + + err = page_reporting_cycle(prdev, zone, order, mt, + sgl, &offset); + if (err) + return err; + } + } + + /* report the leftover pages before going idle */ + leftover = PAGE_REPORTING_CAPACITY - offset; + if (leftover) { + sgl = &sgl[offset]; + err = prdev->report(prdev, sgl, leftover); + + /* flush any remaining pages out from the last report */ + spin_lock_irq(&zone->lock); + page_reporting_drain(prdev, sgl, leftover, !err); + spin_unlock_irq(&zone->lock); + } + + return err; +} + +static void page_reporting_process(struct work_struct *work) +{ + struct delayed_work *d_work = to_delayed_work(work); + struct page_reporting_dev_info *prdev = + container_of(d_work, struct page_reporting_dev_info, work); + int err = 0, state = PAGE_REPORTING_ACTIVE; + struct scatterlist *sgl; + struct zone *zone; + + /* + * Change the state to "Active" so that we can track if there is + * anyone requests page reporting after we complete our pass. If + * the state is not altered by the end of the pass we will switch + * to idle and quit scheduling reporting runs. + */ + atomic_set(&prdev->state, state); + + /* allocate scatterlist to store pages being reported on */ + sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL); + if (!sgl) + goto err_out; + + sg_init_table(sgl, PAGE_REPORTING_CAPACITY); + + for_each_zone(zone) { + err = page_reporting_process_zone(prdev, sgl, zone); + if (err) + break; + } + + kfree(sgl); +err_out: + /* + * If the state has reverted back to requested then there may be + * additional pages to be processed. We will defer for 2s to allow + * more pages to accumulate. + */ + state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE); + if (state == PAGE_REPORTING_REQUESTED) + schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY); +} + +static DEFINE_MUTEX(page_reporting_mutex); +DEFINE_STATIC_KEY_FALSE(page_reporting_enabled); + +int page_reporting_register(struct page_reporting_dev_info *prdev) +{ + int err = 0; + + mutex_lock(&page_reporting_mutex); + + /* nothing to do if already in use */ + if (rcu_access_pointer(pr_dev_info)) { + err = -EBUSY; + goto err_out; + } + + /* initialize state and work structures */ + atomic_set(&prdev->state, PAGE_REPORTING_IDLE); + INIT_DELAYED_WORK(&prdev->work, &page_reporting_process); + + /* Begin initial flush of zones */ + __page_reporting_request(prdev); + + /* Assign device to allow notifications */ + rcu_assign_pointer(pr_dev_info, prdev); + + /* enable page reporting notification */ + if (!static_key_enabled(&page_reporting_enabled)) { + static_branch_enable(&page_reporting_enabled); + pr_info("Free page reporting enabled\n"); + } +err_out: + mutex_unlock(&page_reporting_mutex); + + return err; +} +EXPORT_SYMBOL_GPL(page_reporting_register); + +void page_reporting_unregister(struct page_reporting_dev_info *prdev) +{ + mutex_lock(&page_reporting_mutex); + + if (rcu_access_pointer(pr_dev_info) == prdev) { + /* Disable page reporting notification */ + RCU_INIT_POINTER(pr_dev_info, NULL); + synchronize_rcu(); + + /* Flush any existing work, and lock it out */ + cancel_delayed_work_sync(&prdev->work); + } + + mutex_unlock(&page_reporting_mutex); +} +EXPORT_SYMBOL_GPL(page_reporting_unregister); diff --git a/mm/page_reporting.h b/mm/page_reporting.h new file mode 100644 index 0000000000000000000000000000000000000000..aa6d37f4dc22b86cbf4e3a04db59644e438e76b0 --- /dev/null +++ b/mm/page_reporting.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_PAGE_REPORTING_H +#define _MM_PAGE_REPORTING_H + +#include <linux/mmzone.h> +#include <linux/pageblock-flags.h> +#include <linux/page-isolation.h> +#include <linux/jump_label.h> +#include <linux/slab.h> +#include <asm/pgtable.h> +#include <linux/scatterlist.h> + +#define PAGE_REPORTING_MIN_ORDER pageblock_order + +#ifdef CONFIG_PAGE_REPORTING +DECLARE_STATIC_KEY_FALSE(page_reporting_enabled); +void __page_reporting_notify(void); + +static inline bool page_reported(struct page *page) +{ + return static_branch_unlikely(&page_reporting_enabled) && + PageReported(page); +} + +/** + * page_reporting_notify_free - Free page notification to start page processing + * + * This function is meant to act as a screener for __page_reporting_notify + * which will determine if a give zone has crossed over the high-water mark + * that will justify us beginning page treatment. If we have crossed that + * threshold then it will start the process of pulling some pages and + * placing them in the batch list for treatment. + */ +static inline void page_reporting_notify_free(unsigned int order) +{ + /* Called from hot path in __free_one_page() */ + if (!static_branch_unlikely(&page_reporting_enabled)) + return; + + /* Determine if we have crossed reporting threshold */ + if (order < PAGE_REPORTING_MIN_ORDER) + return; + + /* This will add a few cycles, but should be called infrequently */ + __page_reporting_notify(); +} +#else /* CONFIG_PAGE_REPORTING */ +#define page_reported(_page) false + +static inline void page_reporting_notify_free(unsigned int order) +{ +} +#endif /* CONFIG_PAGE_REPORTING */ +#endif /*_MM_PAGE_REPORTING_H */ diff --git a/mm/rmap.c b/mm/rmap.c index 2df75a119c831b92eeb8d2e5b49632ac7262f74b..f79a206b271a66e7a7b63491c893c8ca9d37d3c5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -275,19 +275,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { struct anon_vma_chain *avc, *pavc; struct anon_vma *root = NULL; - struct vm_area_struct *prev = dst->vm_prev, *pprev = src->vm_prev; - - /* - * If parent share anon_vma with its vm_prev, keep this sharing in in - * child. - * - * 1. Parent has vm_prev, which implies we have vm_prev. - * 2. Parent and its vm_prev have the same anon_vma. - */ - if (!dst->anon_vma && src->anon_vma && - pprev && pprev->anon_vma == src->anon_vma) - dst->anon_vma = prev->anon_vma; - list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; @@ -946,7 +933,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, set_pte_at(vma->vm_mm, address, pte, entry); ret = 1; } else { -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t *pmd = pvmw.pmd; pmd_t entry; @@ -1385,7 +1372,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, struct page *subpage; bool ret = true; struct mmu_notifier_range range; - enum ttu_flags flags = (enum ttu_flags)arg; + enum ttu_flags flags = (enum ttu_flags)(long)arg; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) @@ -1515,6 +1502,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); /* * No need to invalidate here it will synchronize on @@ -1614,6 +1603,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); /* * No need to invalidate here it will synchronize on @@ -1680,6 +1671,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); /* Invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, diff --git a/mm/shmem.c b/mm/shmem.c index f47347cb30f6b0b41684a549508b9d4972aec378..d722eb830317022d0b18392cdd7e61bc2d52195e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -410,7 +410,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, #define SHMEM_HUGE_DENY (-1) #define SHMEM_HUGE_FORCE (-2) -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ static int shmem_huge __read_mostly; @@ -580,7 +580,7 @@ static long shmem_unused_huge_count(struct super_block *sb, struct shmem_sb_info *sbinfo = SHMEM_SB(sb); return READ_ONCE(sbinfo->shrinklist_len); } -#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ #define shmem_huge SHMEM_HUGE_DENY @@ -589,11 +589,11 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, { return 0; } -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) { - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && shmem_huge != SHMEM_HUGE_DENY) return true; @@ -788,6 +788,32 @@ void shmem_unlock_mapping(struct address_space *mapping) } } +/* + * Check whether a hole-punch or truncation needs to split a huge page, + * returning true if no split was required, or the split has been successful. + * + * Eviction (or truncation to 0 size) should never need to split a huge page; + * but in rare cases might do so, if shmem_undo_range() failed to trylock on + * head, and then succeeded to trylock on tail. + * + * A split can only succeed when there are no additional references on the + * huge page: so the split below relies upon find_get_entries() having stopped + * when it found a subpage of the huge page, without getting further references. + */ +static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end) +{ + if (!PageTransCompound(page)) + return true; + + /* Just proceed to delete a huge page wholly within the range punched */ + if (PageHead(page) && + page->index >= start && page->index + HPAGE_PMD_NR <= end) + return true; + + /* Try to split huge page, so we can truly punch the hole or truncate */ + return split_huge_page(page) >= 0; +} + /* * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. @@ -838,31 +864,11 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (!trylock_page(page)) continue; - if (PageTransTail(page)) { - /* Middle of THP: zero out the page */ - clear_highpage(page); - unlock_page(page); - continue; - } else if (PageTransHuge(page)) { - if (index == round_down(end, HPAGE_PMD_NR)) { - /* - * Range ends in the middle of THP: - * zero out the page - */ - clear_highpage(page); - unlock_page(page); - continue; - } - index += HPAGE_PMD_NR - 1; - i += HPAGE_PMD_NR - 1; - } - - if (!unfalloc || !PageUptodate(page)) { - VM_BUG_ON_PAGE(PageTail(page), page); - if (page_mapping(page) == mapping) { - VM_BUG_ON_PAGE(PageWriteback(page), page); + if ((!unfalloc || !PageUptodate(page)) && + page_mapping(page) == mapping) { + VM_BUG_ON_PAGE(PageWriteback(page), page); + if (shmem_punch_compound(page, start, end)) truncate_inode_page(mapping, page); - } } unlock_page(page); } @@ -936,43 +942,25 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, lock_page(page); - if (PageTransTail(page)) { - /* Middle of THP: zero out the page */ - clear_highpage(page); - unlock_page(page); - /* - * Partial thp truncate due 'start' in middle - * of THP: don't need to look on these pages - * again on !pvec.nr restart. - */ - if (index != round_down(end, HPAGE_PMD_NR)) - start++; - continue; - } else if (PageTransHuge(page)) { - if (index == round_down(end, HPAGE_PMD_NR)) { - /* - * Range ends in the middle of THP: - * zero out the page - */ - clear_highpage(page); - unlock_page(page); - continue; - } - index += HPAGE_PMD_NR - 1; - i += HPAGE_PMD_NR - 1; - } - if (!unfalloc || !PageUptodate(page)) { - VM_BUG_ON_PAGE(PageTail(page), page); - if (page_mapping(page) == mapping) { - VM_BUG_ON_PAGE(PageWriteback(page), page); - truncate_inode_page(mapping, page); - } else { + if (page_mapping(page) != mapping) { /* Page was replaced by swap: retry */ unlock_page(page); index--; break; } + VM_BUG_ON_PAGE(PageWriteback(page), page); + if (shmem_punch_compound(page, start, end)) + truncate_inode_page(mapping, page); + else { + /* Wipe the page and don't get stuck */ + clear_highpage(page); + flush_dcache_page(page); + set_page_dirty(page); + if (index < + round_up(start, HPAGE_PMD_NR)) + start = index + 1; + } } unlock_page(page); } @@ -1059,7 +1047,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) * Part of the huge page can be beyond i_size: subject * to shrink under memory pressure. */ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { spin_lock(&sbinfo->shrinklist_lock); /* * _careful to defend against unlocked access to @@ -1472,9 +1460,6 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, pgoff_t hindex; struct page *page; - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) - return NULL; - hindex = round_down(index, HPAGE_PMD_NR); if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, XA_PRESENT)) @@ -1486,6 +1471,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); + else + count_vm_event(THP_FILE_FALLBACK); return page; } @@ -1511,7 +1498,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, int nr; int err = -ENOSPC; - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) huge = false; nr = huge ? HPAGE_PMD_NR : 1; @@ -1813,17 +1800,20 @@ repeat: if (shmem_huge == SHMEM_HUGE_FORCE) goto alloc_huge; switch (sbinfo->huge) { - loff_t i_size; - pgoff_t off; case SHMEM_HUGE_NEVER: goto alloc_nohuge; - case SHMEM_HUGE_WITHIN_SIZE: + case SHMEM_HUGE_WITHIN_SIZE: { + loff_t i_size; + pgoff_t off; + off = round_up(index, HPAGE_PMD_NR); i_size = round_up(i_size_read(inode), PAGE_SIZE); if (i_size >= HPAGE_PMD_SIZE && i_size >> PAGE_SHIFT >= off) goto alloc_huge; - /* fallthrough */ + + fallthrough; + } case SHMEM_HUGE_ADVISE: if (sgp_huge == SGP_HUGE) goto alloc_huge; @@ -1871,8 +1861,13 @@ alloc_nohuge: error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, PageTransHuge(page)); - if (error) + if (error) { + if (PageTransHuge(page)) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } goto unacct; + } error = shmem_add_to_page_cache(page, mapping, hindex, NULL, gfp & GFP_RECLAIM_MASK); if (error) { @@ -2089,7 +2084,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, get_area = current->mm->get_unmapped_area; addr = get_area(file, uaddr, len, pgoff, flags); - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; if (IS_ERR_VALUE(addr)) return addr; @@ -2228,7 +2223,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &shmem_vm_ops; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < (vma->vm_end & HPAGE_PMD_MASK)) { khugepaged_enter(vma, vma->vm_flags); @@ -3113,12 +3108,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); - if (error) { - if (error != -EOPNOTSUPP) { - iput(inode); - return error; - } - error = 0; + if (error && error != -EOPNOTSUPP) { + iput(inode); + return error; } inode->i_size = len-1; @@ -3455,7 +3447,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) case Opt_huge: ctx->huge = result.uint_32; if (ctx->huge != SHMEM_HUGE_NEVER && - !(IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_hugepage())) goto unsupported_parameter; ctx->seen |= SHMEM_SEEN_HUGE; @@ -3601,7 +3593,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ if (sbinfo->huge) seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); @@ -3846,7 +3838,7 @@ static const struct super_operations shmem_ops = { .evict_inode = shmem_evict_inode, .drop_inode = generic_delete_inode, .put_super = shmem_put_super, -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, .free_cached_objects = shmem_unused_huge_scan, #endif @@ -3908,7 +3900,7 @@ int __init shmem_init(void) goto out1; } -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else @@ -3924,7 +3916,7 @@ out2: return error; } -#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS) +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) static ssize_t shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3976,9 +3968,9 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute shmem_enabled_attr = __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_huge_enabled(struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); @@ -4004,7 +3996,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) if (i_size >= HPAGE_PMD_SIZE && i_size >> PAGE_SHIFT >= off) return true; - /* fall through */ + fallthrough; case SHMEM_HUGE_ADVISE: /* TODO: implement fadvise() hints */ return (vma->vm_flags & VM_HUGEPAGE); @@ -4013,7 +4005,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) return false; } } -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #else /* !CONFIG_SHMEM */ @@ -4182,7 +4174,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < (vma->vm_end & HPAGE_PMD_MASK)) { khugepaged_enter(vma, vma->vm_flags); diff --git a/mm/shuffle.c b/mm/shuffle.c index c716059cbd3cca310c88597431ba44c4340362fd..44406d9977c7779ab347dda835608046882569d7 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -183,11 +183,11 @@ void __meminit __shuffle_free_memory(pg_data_t *pgdat) shuffle_zone(z); } -void add_to_free_area_random(struct page *page, struct free_area *area, - int migratetype) +bool shuffle_pick_tail(void) { static u64 rand; static u8 rand_bits; + bool ret; /* * The lack of locking is deliberate. If 2 threads race to @@ -198,10 +198,10 @@ void add_to_free_area_random(struct page *page, struct free_area *area, rand = get_random_u64(); } - if (rand & 1) - add_to_free_area(page, area, migratetype); - else - add_to_free_area_tail(page, area, migratetype); + ret = rand & 1; + rand_bits--; rand >>= 1; + + return ret; } diff --git a/mm/shuffle.h b/mm/shuffle.h index 777a257a0d2f9346f10e68db9da34789fd5a3711..4d79f03b6658ff29799a0b3f47c9a5de2d593475 100644 --- a/mm/shuffle.h +++ b/mm/shuffle.h @@ -22,6 +22,7 @@ enum mm_shuffle_ctl { DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key); extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl); extern void __shuffle_free_memory(pg_data_t *pgdat); +extern bool shuffle_pick_tail(void); static inline void shuffle_free_memory(pg_data_t *pgdat) { if (!static_branch_unlikely(&page_alloc_shuffle_key)) @@ -44,6 +45,11 @@ static inline bool is_shuffle_order(int order) return order >= SHUFFLE_ORDER; } #else +static inline bool shuffle_pick_tail(void) +{ + return false; +} + static inline void shuffle_free_memory(pg_data_t *pgdat) { } diff --git a/mm/slab_common.c b/mm/slab_common.c index 5282f881d2f5171e47ba1bbf60b7fb3f4aa1040a..93ec4a574d8d5d4e7b9b0a6200cc24c55fc16211 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1581,6 +1581,7 @@ static int slabinfo_open(struct inode *inode, struct file *file) } static const struct proc_ops slabinfo_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = slabinfo_open, .proc_read = seq_read, .proc_write = slabinfo_write, diff --git a/mm/slub.c b/mm/slub.c index 3098e0cf28992a344fc3de08245d0db97dbb3060..332d4b459a907b6a25e8fef7452e26c61604c2ea 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -449,6 +449,7 @@ static DEFINE_SPINLOCK(object_map_lock); * not vanish from under us. */ static unsigned long *get_map(struct kmem_cache *s, struct page *page) + __acquires(&object_map_lock) { void *p; void *addr = page_address(page); @@ -465,7 +466,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) return object_map; } -static void put_map(unsigned long *map) +static void put_map(unsigned long *map) __releases(&object_map_lock) { VM_BUG_ON(map != object_map); lockdep_assert_held(&object_map_lock); diff --git a/mm/sparse.c b/mm/sparse.c index f1af4d4ee80ba44a9c332826abc3591c75bed829..1aee5a4815715c13bb2c966fb487f807757fc5d9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -209,6 +209,7 @@ static inline unsigned long first_present_section_nr(void) return next_present_section_nr(-1); } +#ifdef CONFIG_SPARSEMEM_VMEMMAP static void subsection_mask_set(unsigned long *map, unsigned long pfn, unsigned long nr_pages) { @@ -243,6 +244,11 @@ void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) nr_pages -= pfns; } } +#else +void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) +{ +} +#endif /* Record a memory area against a node. */ void __init memory_present(int nid, unsigned long start, unsigned long end) @@ -660,6 +666,55 @@ static void free_map_bootmem(struct page *memmap) vmemmap_free(start, end, NULL); } + +static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; + struct mem_section *ms = __pfn_to_section(pfn); + unsigned long *subsection_map = ms->usage + ? &ms->usage->subsection_map[0] : NULL; + + subsection_mask_set(map, pfn, nr_pages); + if (subsection_map) + bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION); + + if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), + "section already deactivated (%#lx + %ld)\n", + pfn, nr_pages)) + return -EINVAL; + + bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); + return 0; +} + +static bool is_subsection_map_empty(struct mem_section *ms) +{ + return bitmap_empty(&ms->usage->subsection_map[0], + SUBSECTIONS_PER_SECTION); +} + +static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + struct mem_section *ms = __pfn_to_section(pfn); + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + unsigned long *subsection_map; + int rc = 0; + + subsection_mask_set(map, pfn, nr_pages); + + subsection_map = &ms->usage->subsection_map[0]; + + if (bitmap_empty(map, SUBSECTIONS_PER_SECTION)) + rc = -EINVAL; + else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION)) + rc = -EEXIST; + else + bitmap_or(subsection_map, map, subsection_map, + SUBSECTIONS_PER_SECTION); + + return rc; +} #else struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap) @@ -703,48 +758,51 @@ static void free_map_bootmem(struct page *memmap) put_page_bootmem(page); } } + +static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + return 0; +} + +static bool is_subsection_map_empty(struct mem_section *ms) +{ + return true; +} + +static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + return 0; +} #endif /* CONFIG_SPARSEMEM_VMEMMAP */ +/* + * To deactivate a memory region, there are 3 cases to handle across + * two configurations (SPARSEMEM_VMEMMAP={y,n}): + * + * 1. deactivation of a partial hot-added section (only possible in + * the SPARSEMEM_VMEMMAP=y case). + * a) section was present at memory init. + * b) section was hot-added post memory init. + * 2. deactivation of a complete hot-added section. + * 3. deactivation of a complete section from memory init. + * + * For 1, when subsection_map does not empty we will not be freeing the + * usage map, but still need to free the vmemmap range. + * + * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified + */ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; - DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; struct mem_section *ms = __pfn_to_section(pfn); bool section_is_early = early_section(ms); struct page *memmap = NULL; bool empty; - unsigned long *subsection_map = ms->usage - ? &ms->usage->subsection_map[0] : NULL; - - subsection_mask_set(map, pfn, nr_pages); - if (subsection_map) - bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION); - if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), - "section already deactivated (%#lx + %ld)\n", - pfn, nr_pages)) + if (clear_subsection_map(pfn, nr_pages)) return; - /* - * There are 3 cases to handle across two configurations - * (SPARSEMEM_VMEMMAP={y,n}): - * - * 1/ deactivation of a partial hot-added section (only possible - * in the SPARSEMEM_VMEMMAP=y case). - * a/ section was present at memory init - * b/ section was hot-added post memory init - * 2/ deactivation of a complete hot-added section - * 3/ deactivation of a complete section from memory init - * - * For 1/, when subsection_map does not empty we will not be - * freeing the usage map, but still need to free the vmemmap - * range. - * - * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified - */ - bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); - empty = bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION); + empty = is_subsection_map_empty(ms); if (empty) { unsigned long section_nr = pfn_to_section_nr(pfn); @@ -780,31 +838,19 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, static struct page * __meminit section_activate(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; struct mem_section *ms = __pfn_to_section(pfn); struct mem_section_usage *usage = NULL; - unsigned long *subsection_map; struct page *memmap; int rc = 0; - subsection_mask_set(map, pfn, nr_pages); - if (!ms->usage) { usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); if (!usage) return ERR_PTR(-ENOMEM); ms->usage = usage; } - subsection_map = &ms->usage->subsection_map[0]; - - if (bitmap_empty(map, SUBSECTIONS_PER_SECTION)) - rc = -EINVAL; - else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION)) - rc = -EEXIST; - else - bitmap_or(subsection_map, map, subsection_map, - SUBSECTIONS_PER_SECTION); + rc = fill_subsection_map(pfn, nr_pages); if (rc) { if (usage) ms->usage = NULL; @@ -840,6 +886,10 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, * * This is only intended for hotplug. * + * Note that only VMEMMAP supports sub-section aligned hotplug, + * the proper alignment and size are gated by check_pfn_span(). + * + * * Return: * * 0 - On success. * * -EEXIST - Section has been present. diff --git a/mm/swap.c b/mm/swap.c index a4af8c999963568fc126205cf93bd5693fed1343..bf9a79fed62d7aed216ed2dd38f6e3557bd49172 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -276,7 +276,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); + int file = page_is_file_lru(page); int lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru); @@ -394,7 +394,7 @@ void mark_page_accessed(struct page *page) else __lru_cache_activate_page(page); ClearPageReferenced(page); - if (page_is_file_cache(page)) + if (page_is_file_lru(page)) workingset_activation(page); } if (page_is_idle(page)) @@ -515,7 +515,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, return; active = PageActive(page); - file = page_is_file_cache(page); + file = page_is_file_lru(page); lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru + active); @@ -548,7 +548,7 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); + int file = page_is_file_lru(page); int lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); @@ -573,9 +573,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, ClearPageActive(page); ClearPageReferenced(page); /* - * lazyfree pages are clean anonymous pages. They have - * SwapBacked flag cleared to distinguish normal anonymous - * pages + * Lazyfree pages are clean anonymous pages. They have + * PG_swapbacked flag cleared, to distinguish them from normal + * anonymous pages */ ClearPageSwapBacked(page); add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); @@ -962,7 +962,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, if (page_evictable(page)) { lru = page_lru(page); - update_page_reclaim_stat(lruvec, page_is_file_cache(page), + update_page_reclaim_stat(lruvec, page_is_file_lru(page), PageActive(page)); if (was_unevictable) count_vm_event(UNEVICTABLE_PGRESCUED); @@ -1004,6 +1004,10 @@ void __pagevec_lru_add(struct pagevec *pvec) * ascending indexes. There may be holes in the indices due to * not-present entries. * + * Only one subpage of a Transparent Huge Page is returned in one call: + * allowing truncate_inode_pages_range() to evict the whole THP without + * cycling through a pagevec of extra references. + * * pagevec_lookup_entries() returns the number of entries which were * found. */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 273a923c275c964adc37481ba3e918ee3ab0a80f..5871a2aa86a57d4a43d2edb8d76fa083265cfde0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2797,6 +2797,7 @@ static int swaps_open(struct inode *inode, struct file *file) } static const struct proc_ops swaps_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = swaps_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index bd96855f39614156fdbd4fa54ab72a2e26e7055f..512576e171ce56791444965937f228f6ef7192b5 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -53,7 +53,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - struct page **pagep) + struct page **pagep, + bool wp_copy) { struct mem_cgroup *memcg; pte_t _dst_pte, *dst_pte; @@ -99,9 +100,13 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) goto out_release; - _dst_pte = mk_pte(page, dst_vma->vm_page_prot); - if (dst_vma->vm_flags & VM_WRITE) - _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot)); + if (dst_vma->vm_flags & VM_WRITE) { + if (wp_copy) + _dst_pte = pte_mkuffd_wp(_dst_pte); + else + _dst_pte = pte_mkwrite(_dst_pte); + } dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); if (dst_vma->vm_file) { @@ -415,7 +420,8 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, unsigned long dst_addr, unsigned long src_addr, struct page **page, - bool zeropage) + bool zeropage, + bool wp_copy) { ssize_t err; @@ -432,11 +438,13 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, if (!(dst_vma->vm_flags & VM_SHARED)) { if (!zeropage) err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, page); + dst_addr, src_addr, page, + wp_copy); else err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, dst_addr); } else { + VM_WARN_ON_ONCE(wp_copy); if (!zeropage) err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, @@ -454,7 +462,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long src_start, unsigned long len, bool zeropage, - bool *mmap_changing) + bool *mmap_changing, + __u64 mode) { struct vm_area_struct *dst_vma; ssize_t err; @@ -462,6 +471,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long src_addr, dst_addr; long copied; struct page *page; + bool wp_copy; /* * Sanitize the command parameters: @@ -507,6 +517,14 @@ retry: dst_vma->vm_flags & VM_SHARED)) goto out_unlock; + /* + * validate 'mode' now that we know the dst_vma: don't allow + * a wrprotect copy if the userfaultfd didn't register as WP. + */ + wp_copy = mode & UFFDIO_COPY_MODE_WP; + if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP)) + goto out_unlock; + /* * If this is a HUGETLB vma, pass off to appropriate routine */ @@ -562,7 +580,7 @@ retry: BUG_ON(pmd_trans_huge(*dst_pmd)); err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - src_addr, &page, zeropage); + src_addr, &page, zeropage, wp_copy); cond_resched(); if (unlikely(err == -ENOENT)) { @@ -609,14 +627,68 @@ out: ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing) + bool *mmap_changing, __u64 mode) { return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, - mmap_changing); + mmap_changing, mode); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing); + return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0); +} + +int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, bool enable_wp, bool *mmap_changing) +{ + struct vm_area_struct *dst_vma; + pgprot_t newprot; + int err; + + /* + * Sanitize the command parameters: + */ + BUG_ON(start & ~PAGE_MASK); + BUG_ON(len & ~PAGE_MASK); + + /* Does the address range wrap, or is the span zero-sized? */ + BUG_ON(start + len <= start); + + down_read(&dst_mm->mmap_sem); + + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + err = -EAGAIN; + if (mmap_changing && READ_ONCE(*mmap_changing)) + goto out_unlock; + + err = -ENOENT; + dst_vma = find_dst_vma(dst_mm, start, len); + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + if (!userfaultfd_wp(dst_vma)) + goto out_unlock; + if (!vma_is_anonymous(dst_vma)) + goto out_unlock; + + if (enable_wp) + newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); + else + newprot = vm_get_page_prot(dst_vma->vm_flags); + + change_protection(dst_vma, start, start + len, newprot, + enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); + + err = 0; +out_unlock: + up_read(&dst_mm->mmap_sem); + return err; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6b8eeb0ecee518a46c1a62c8b884ec9be56e89b7..399f219544f74eb45ba2766250b45b9cdbeb971a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3368,7 +3368,7 @@ retry: goto overflow; /* - * If required width exeeds current VA block, move + * If required width exceeds current VA block, move * base downwards and then recheck. */ if (base + end > va->va_end) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 2e8e690d28131504ef3c6ad7c51d80dd3d60ddd9..b06868fc492659a7038eed45a420d3dec391f5d0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -919,7 +919,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * exceptional entries and shadow exceptional entries in the * same address_space. */ - if (reclaimed && page_is_file_cache(page) && + if (reclaimed && page_is_file_lru(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(page, target_memcg); __delete_from_page_cache(page, shadow); @@ -1043,7 +1043,7 @@ static void page_check_dirty_writeback(struct page *page, * Anonymous pages are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them */ - if (!page_is_file_cache(page) || + if (!page_is_file_lru(page) || (PageAnon(page) && !PageSwapBacked(page))) { *dirty = false; *writeback = false; @@ -1315,7 +1315,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * the rest of the LRU for clean pages and see * the same dirty pages again (PageReclaim). */ - if (page_is_file_cache(page) && + if (page_is_file_lru(page) && (!current_is_kswapd() || !PageReclaim(page) || !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* @@ -1459,7 +1459,7 @@ activate_locked: try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); if (!PageMlocked(page)) { - int type = page_is_file_cache(page); + int type = page_is_file_lru(page); SetPageActive(page); stat->nr_activate[type] += nr_pages; count_memcg_page_event(page, PGACTIVATE); @@ -1497,7 +1497,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, LIST_HEAD(clean_pages); list_for_each_entry_safe(page, next, page_list, lru) { - if (page_is_file_cache(page) && !PageDirty(page) && + if (page_is_file_lru(page) && !PageDirty(page) && !__PageMovable(page) && !PageUnevictable(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); @@ -2053,7 +2053,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * IO, plus JVM can create lots of anon VM_EXEC pages, * so we ignore them here. */ - if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { + if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) { list_add(&page->lru, &l_active); continue; } diff --git a/mm/vmstat.c b/mm/vmstat.c index c9c0d71f917ff14bc3d83de951a8b8e2a71606c6..96d21a792b57c35ad59f1f062b7efedf9f59a34c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1256,9 +1256,12 @@ const char * const vmstat_text[] = { #ifdef CONFIG_TRANSPARENT_HUGEPAGE "thp_fault_alloc", "thp_fault_fallback", + "thp_fault_fallback_charge", "thp_collapse_alloc", "thp_collapse_alloc_failed", "thp_file_alloc", + "thp_file_fallback", + "thp_file_fallback_charge", "thp_file_mapped", "thp_split_page", "thp_split_page_failed", diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 22d17ecfe7df464412426e2dc4079c541f28ae4d..2f836a2b993f0cf16a3cf2d6c28e8a52d30ba604 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -424,7 +424,7 @@ static void *zs_zpool_map(void *pool, unsigned long handle, case ZPOOL_MM_WO: zs_mm = ZS_MM_WO; break; - case ZPOOL_MM_RW: /* fall through */ + case ZPOOL_MM_RW: default: zs_mm = ZS_MM_RW; break; @@ -891,12 +891,12 @@ static inline int trypin_tag(unsigned long handle) return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); } -static void pin_tag(unsigned long handle) +static void pin_tag(unsigned long handle) __acquires(bitlock) { bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); } -static void unpin_tag(unsigned long handle) +static void unpin_tag(unsigned long handle) __releases(bitlock) { bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); } @@ -1833,12 +1833,12 @@ static void migrate_lock_init(struct zspage *zspage) rwlock_init(&zspage->lock); } -static void migrate_read_lock(struct zspage *zspage) +static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock) { read_lock(&zspage->lock); } -static void migrate_read_unlock(struct zspage *zspage) +static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) { read_unlock(&zspage->lock); } diff --git a/mm/zswap.c b/mm/zswap.c index 55094e63b72d9468e57facb616907de5247f71a2..fbb782924ccc5e4c18fea6c0c2e747812ca0d5e0 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -77,8 +77,8 @@ static bool zswap_pool_reached_full; #define ZSWAP_PARAM_UNSET "" -/* Enable/disable zswap (disabled by default) */ -static bool zswap_enabled; +/* Enable/disable zswap */ +static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON); static int zswap_enabled_param_set(const char *, const struct kernel_param *); static struct kernel_param_ops zswap_enabled_param_ops = { @@ -88,8 +88,7 @@ static struct kernel_param_ops zswap_enabled_param_ops = { module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); /* Crypto compressor to use */ -#define ZSWAP_COMPRESSOR_DEFAULT "lzo" -static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; +static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; static int zswap_compressor_param_set(const char *, const struct kernel_param *); static struct kernel_param_ops zswap_compressor_param_ops = { @@ -101,8 +100,7 @@ module_param_cb(compressor, &zswap_compressor_param_ops, &zswap_compressor, 0644); /* Compressed storage zpool to use */ -#define ZSWAP_ZPOOL_DEFAULT "zbud" -static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; +static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; static int zswap_zpool_param_set(const char *, const struct kernel_param *); static struct kernel_param_ops zswap_zpool_param_ops = { .set = zswap_zpool_param_set, @@ -599,11 +597,12 @@ static __init struct zswap_pool *__zswap_pool_create_fallback(void) bool has_comp, has_zpool; has_comp = crypto_has_comp(zswap_compressor, 0, 0); - if (!has_comp && strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) { + if (!has_comp && strcmp(zswap_compressor, + CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { pr_err("compressor %s not available, using default %s\n", - zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT); + zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); param_free_charp(&zswap_compressor); - zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; + zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; has_comp = crypto_has_comp(zswap_compressor, 0, 0); } if (!has_comp) { @@ -614,11 +613,12 @@ static __init struct zswap_pool *__zswap_pool_create_fallback(void) } has_zpool = zpool_has_pool(zswap_zpool_type); - if (!has_zpool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + if (!has_zpool && strcmp(zswap_zpool_type, + CONFIG_ZSWAP_ZPOOL_DEFAULT)) { pr_err("zpool %s not available, using default %s\n", - zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT); + zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); param_free_charp(&zswap_zpool_type); - zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; + zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; has_zpool = zpool_has_pool(zswap_zpool_type); } if (!has_zpool) { diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c index c58504774118ba3c50e88e2fb915e11d58acebfd..418c46fe5ffc3974c660f85008a864b4edc5da59 100644 --- a/samples/hw_breakpoint/data_breakpoint.c +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -23,7 +23,7 @@ struct perf_event * __percpu *sample_hbp; -static char ksym_name[KSYM_NAME_LEN] = "pid_max"; +static char ksym_name[KSYM_NAME_LEN] = "jiffies"; module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO); MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any" " write operations on the kernel symbol"); @@ -41,11 +41,15 @@ static int __init hw_break_module_init(void) { int ret; struct perf_event_attr attr; + void *addr = __symbol_get(ksym_name); + + if (!addr) + return -ENXIO; hw_breakpoint_init(&attr); - attr.bp_addr = kallsyms_lookup_name(ksym_name); + attr.bp_addr = (unsigned long)addr; attr.bp_len = HW_BREAKPOINT_LEN_4; - attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; + attr.bp_type = HW_BREAKPOINT_W; sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL); if (IS_ERR((void __force *)sample_hbp)) { @@ -66,6 +70,7 @@ fail: static void __exit hw_break_module_exit(void) { unregister_wide_hw_breakpoint(sample_hbp); + symbol_put(ksym_name); printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name); } diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan index 019771b845c5ff15727bcd23975c3ac2694ca80f..5b15bc425ec9872dba1f59e9c827699d7dd12b67 100644 --- a/scripts/Makefile.ubsan +++ b/scripts/Makefile.ubsan @@ -1,16 +1,26 @@ # SPDX-License-Identifier: GPL-2.0 ifdef CONFIG_UBSAN + +ifdef CONFIG_UBSAN_ALIGNMENT + CFLAGS_UBSAN += $(call cc-option, -fsanitize=alignment) +endif + +ifdef CONFIG_UBSAN_BOUNDS + CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds) +endif + +ifdef CONFIG_UBSAN_MISC CFLAGS_UBSAN += $(call cc-option, -fsanitize=shift) CFLAGS_UBSAN += $(call cc-option, -fsanitize=integer-divide-by-zero) CFLAGS_UBSAN += $(call cc-option, -fsanitize=unreachable) CFLAGS_UBSAN += $(call cc-option, -fsanitize=signed-integer-overflow) - CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds) CFLAGS_UBSAN += $(call cc-option, -fsanitize=object-size) CFLAGS_UBSAN += $(call cc-option, -fsanitize=bool) CFLAGS_UBSAN += $(call cc-option, -fsanitize=enum) +endif -ifdef CONFIG_UBSAN_ALIGNMENT - CFLAGS_UBSAN += $(call cc-option, -fsanitize=alignment) +ifdef CONFIG_UBSAN_TRAP + CFLAGS_UBSAN += $(call cc-option, -fsanitize-undefined-trap-on-error) endif # -fsanitize=* options makes GCC less smart than usual and diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index a63380c6b0d20f8f2fd9c73b715343177f607bb7..d64c67b67e3c7b76073798f2959ee5ef41206f1b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -64,6 +64,7 @@ my $color = "auto"; my $allow_c99_comments = 1; # Can be overridden by --ignore C99_COMMENT_TOLERANCE # git output parsing needs US English output, so first set backtick child process LANGUAGE my $git_command ='export LANGUAGE=en_US.UTF-8; git'; +my $tabsize = 8; sub help { my ($exitcode) = @_; @@ -98,6 +99,7 @@ Options: --show-types show the specific message type in the output --max-line-length=n set the maximum line length, if exceeded, warn --min-conf-desc-length=n set the min description length, if shorter, warn + --tab-size=n set the number of spaces for tab (default 8) --root=PATH PATH to the kernel tree root --no-summary suppress the per-file summary --mailback only produce a report in case of warnings/errors @@ -215,6 +217,7 @@ GetOptions( 'list-types!' => \$list_types, 'max-line-length=i' => \$max_line_length, 'min-conf-desc-length=i' => \$min_conf_desc_length, + 'tab-size=i' => \$tabsize, 'root=s' => \$root, 'summary!' => \$summary, 'mailback!' => \$mailback, @@ -267,6 +270,9 @@ if ($color =~ /^[01]$/) { die "Invalid color mode: $color\n"; } +# skip TAB size 1 to avoid additional checks on $tabsize - 1 +die "Invalid TAB size: $tabsize\n" if ($tabsize < 2); + sub hash_save_array_words { my ($hashRef, $arrayRef) = @_; @@ -804,12 +810,12 @@ sub build_types { }x; $Type = qr{ $NonptrType - (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+)? + (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+){0,4} (?:\s+$Inline|\s+$Modifier)* }x; $TypeMisordered = qr{ $NonptrTypeMisordered - (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+)? + (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+){0,4} (?:\s+$Inline|\s+$Modifier)* }x; $Declare = qr{(?:$Storage\s+(?:$Inline\s+)?)?$Type}; @@ -1118,6 +1124,7 @@ sub parse_email { my ($formatted_email) = @_; my $name = ""; + my $name_comment = ""; my $address = ""; my $comment = ""; @@ -1150,6 +1157,10 @@ sub parse_email { $name = trim($name); $name =~ s/^\"|\"$//g; + $name =~ s/(\s*\([^\)]+\))\s*//; + if (defined($1)) { + $name_comment = trim($1); + } $address = trim($address); $address =~ s/^\<|\>$//g; @@ -1158,7 +1169,7 @@ sub parse_email { $name = "\"$name\""; } - return ($name, $address, $comment); + return ($name, $name_comment, $address, $comment); } sub format_email { @@ -1184,6 +1195,23 @@ sub format_email { return $formatted_email; } +sub reformat_email { + my ($email) = @_; + + my ($email_name, $name_comment, $email_address, $comment) = parse_email($email); + return format_email($email_name, $email_address); +} + +sub same_email_addresses { + my ($email1, $email2) = @_; + + my ($email1_name, $name1_comment, $email1_address, $comment1) = parse_email($email1); + my ($email2_name, $name2_comment, $email2_address, $comment2) = parse_email($email2); + + return $email1_name eq $email2_name && + $email1_address eq $email2_address; +} + sub which { my ($bin) = @_; @@ -1217,7 +1245,7 @@ sub expand_tabs { if ($c eq "\t") { $res .= ' '; $n++; - for (; ($n % 8) != 0; $n++) { + for (; ($n % $tabsize) != 0; $n++) { $res .= ' '; } next; @@ -2230,7 +2258,7 @@ sub string_find_replace { sub tabify { my ($leading) = @_; - my $source_indent = 8; + my $source_indent = $tabsize; my $max_spaces_before_tab = $source_indent - 1; my $spaces_to_tab = " " x $source_indent; @@ -2272,6 +2300,19 @@ sub pos_last_openparen { return length(expand_tabs(substr($line, 0, $last_openparen))) + 1; } +sub get_raw_comment { + my ($line, $rawline) = @_; + my $comment = ''; + + for my $i (0 .. (length($line) - 1)) { + if (substr($line, $i, 1) eq "$;") { + $comment .= substr($rawline, $i, 1); + } + } + + return $comment; +} + sub process { my $filename = shift; @@ -2294,6 +2335,7 @@ sub process { my $is_binding_patch = -1; my $in_header_lines = $file ? 0 : 1; my $in_commit_log = 0; #Scanning lines before patch + my $has_patch_separator = 0; #Found a --- line my $has_commit_log = 0; #Encountered lines before patch my $commit_log_lines = 0; #Number of commit log lines my $commit_log_possible_stack_dump = 0; @@ -2433,6 +2475,7 @@ sub process { $sline =~ s/$;/ /g; #with comments as spaces my $rawline = $rawlines[$linenr - 1]; + my $raw_comment = get_raw_comment($line, $rawline); # check if it's a mode change, rename or start of a patch if (!$in_commit_log && @@ -2604,21 +2647,26 @@ sub process { $author = $1; $author = encode("utf8", $author) if ($line =~ /=\?utf-8\?/i); $author =~ s/"//g; + $author = reformat_email($author); } # Check the patch for a signoff: - if ($line =~ /^\s*signed-off-by:/i) { + if ($line =~ /^\s*signed-off-by:\s*(.*)/i) { $signoff++; $in_commit_log = 0; if ($author ne '') { - my $l = $line; - $l =~ s/"//g; - if ($l =~ /^\s*signed-off-by:\s*\Q$author\E/i) { - $authorsignoff = 1; + if (same_email_addresses($1, $author)) { + $authorsignoff = 1; } } } +# Check for patch separator + if ($line =~ /^---$/) { + $has_patch_separator = 1; + $in_commit_log = 0; + } + # Check if MAINTAINERS is being updated. If so, there's probably no need to # emit the "does MAINTAINERS need updating?" message on file add/move/delete if ($line =~ /^\s*MAINTAINERS\s*\|/) { @@ -2664,7 +2712,7 @@ sub process { } } - my ($email_name, $email_address, $comment) = parse_email($email); + my ($email_name, $name_comment, $email_address, $comment) = parse_email($email); my $suggested_email = format_email(($email_name, $email_address)); if ($suggested_email eq "") { ERROR("BAD_SIGN_OFF", @@ -2675,9 +2723,7 @@ sub process { $dequoted =~ s/" </ </; # Don't force email to have quotes # Allow just an angle bracketed address - if ("$dequoted$comment" ne $email && - "<$email_address>$comment" ne $email && - "$suggested_email$comment" ne $email) { + if (!same_email_addresses($email, $suggested_email)) { WARN("BAD_SIGN_OFF", "email address '$email' might be better as '$suggested_email$comment'\n" . $herecurr); } @@ -2720,10 +2766,10 @@ sub process { "A patch subject line should describe the change not the tool that found it\n" . $herecurr); } -# Check for unwanted Gerrit info - if ($in_commit_log && $line =~ /^\s*change-id:/i) { +# Check for Gerrit Change-Ids not in any patch context + if ($realfile eq '' && !$has_patch_separator && $line =~ /^\s*change-id:/i) { ERROR("GERRIT_CHANGE_ID", - "Remove Gerrit Change-Id's before submitting upstream.\n" . $herecurr); + "Remove Gerrit Change-Id's before submitting upstream\n" . $herecurr); } # Check if the commit log is in a possible stack dump @@ -2761,7 +2807,7 @@ sub process { # Check for git id commit length and improperly formed commit descriptions if ($in_commit_log && !$commit_log_possible_stack_dump && - $line !~ /^\s*(?:Link|Patchwork|http|https|BugLink):/i && + $line !~ /^\s*(?:Link|Patchwork|http|https|BugLink|base-commit):/i && $line !~ /^This reverts commit [0-9a-f]{7,40}/ && ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i || ($line =~ /(?:\s|^)[0-9a-f]{12,40}(?:[\s"'\(\[]|$)/i && @@ -3087,7 +3133,7 @@ sub process { $comment = '/*'; } elsif ($realfile =~ /\.(c|dts|dtsi)$/) { $comment = '//'; - } elsif (($checklicenseline == 2) || $realfile =~ /\.(sh|pl|py|awk|tc)$/) { + } elsif (($checklicenseline == 2) || $realfile =~ /\.(sh|pl|py|awk|tc|yaml)$/) { $comment = '#'; } elsif ($realfile =~ /\.rst$/) { $comment = '..'; @@ -3111,6 +3157,17 @@ sub process { WARN("SPDX_LICENSE_TAG", "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr); } + if ($realfile =~ m@^Documentation/devicetree/bindings/@ && + not $spdx_license =~ /GPL-2\.0.*BSD-2-Clause/) { + my $msg_level = \&WARN; + $msg_level = \&CHK if ($file); + if (&{$msg_level}("SPDX_LICENSE_TAG", + + "DT binding documents should be licensed (GPL-2.0-only OR BSD-2-Clause)\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/SPDX-License-Identifier: .*/SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)/; + } + } } } } @@ -3198,7 +3255,7 @@ sub process { next if ($realfile !~ /\.(h|c|pl|dtsi|dts)$/); # at the beginning of a line any tabs must come first and anything -# more than 8 must use tabs. +# more than $tabsize must use tabs. if ($rawline =~ /^\+\s* \t\s*\S/ || $rawline =~ /^\+\s* \s*/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; @@ -3217,7 +3274,7 @@ sub process { "please, no space before tabs\n" . $herevet) && $fix) { while ($fixed[$fixlinenr] =~ - s/(^\+.*) {8,8}\t/$1\t\t/) {} + s/(^\+.*) {$tabsize,$tabsize}\t/$1\t\t/) {} while ($fixed[$fixlinenr] =~ s/(^\+.*) +\t/$1\t/) {} } @@ -3239,11 +3296,11 @@ sub process { if ($perl_version_ok && $sline =~ /^\+\t+( +)(?:$c90_Keywords\b|\{\s*$|\}\s*(?:else\b|while\b|\s*$)|$Declare\s*$Ident\s*[;=])/) { my $indent = length($1); - if ($indent % 8) { + if ($indent % $tabsize) { if (WARN("TABSTOP", "Statements should start on a tabstop\n" . $herecurr) && $fix) { - $fixed[$fixlinenr] =~ s@(^\+\t+) +@$1 . "\t" x ($indent/8)@e; + $fixed[$fixlinenr] =~ s@(^\+\t+) +@$1 . "\t" x ($indent/$tabsize)@e; } } } @@ -3261,8 +3318,8 @@ sub process { my $newindent = $2; my $goodtabindent = $oldindent . - "\t" x ($pos / 8) . - " " x ($pos % 8); + "\t" x ($pos / $tabsize) . + " " x ($pos % $tabsize); my $goodspaceindent = $oldindent . " " x $pos; if ($newindent ne $goodtabindent && @@ -3733,11 +3790,11 @@ sub process { #print "line<$line> prevline<$prevline> indent<$indent> sindent<$sindent> check<$check> continuation<$continuation> s<$s> cond_lines<$cond_lines> stat_real<$stat_real> stat<$stat>\n"; if ($check && $s ne '' && - (($sindent % 8) != 0 || + (($sindent % $tabsize) != 0 || ($sindent < $indent) || ($sindent == $indent && ($s !~ /^\s*(?:\}|\{|else\b)/)) || - ($sindent > $indent + 8))) { + ($sindent > $indent + $tabsize))) { WARN("SUSPECT_CODE_INDENT", "suspect code indent for conditional statements ($indent, $sindent)\n" . $herecurr . "$stat_real\n"); } @@ -4014,7 +4071,7 @@ sub process { } # check for function declarations without arguments like "int foo()" - if ($line =~ /(\b$Type\s+$Ident)\s*\(\s*\)/) { + if ($line =~ /(\b$Type\s*$Ident)\s*\(\s*\)/) { if (ERROR("FUNCTION_WITHOUT_ARGS", "Bad function definition - $1() should probably be $1(void)\n" . $herecurr) && $fix) { @@ -4582,7 +4639,7 @@ sub process { ($op eq '>' && $ca =~ /<\S+\@\S+$/)) { - $ok = 1; + $ok = 1; } # for asm volatile statements @@ -4917,7 +4974,7 @@ sub process { # conditional. substr($s, 0, length($c), ''); $s =~ s/\n.*//g; - $s =~ s/$;//g; # Remove any comments + $s =~ s/$;//g; # Remove any comments if (length($c) && $s !~ /^\s*{?\s*\\*\s*$/ && $c !~ /}\s*while\s*/) { @@ -4956,7 +5013,7 @@ sub process { # if and else should not have general statements after it if ($line =~ /^.\s*(?:}\s*)?else\b(.*)/) { my $s = $1; - $s =~ s/$;//g; # Remove any comments + $s =~ s/$;//g; # Remove any comments if ($s !~ /^\s*(?:\sif|(?:{|)\s*\\?\s*$)/) { ERROR("TRAILING_STATEMENTS", "trailing statements should be on next line\n" . $herecurr); @@ -5132,7 +5189,7 @@ sub process { { } - # Flatten any obvious string concatentation. + # Flatten any obvious string concatenation. while ($dstat =~ s/($String)\s*$Ident/$1/ || $dstat =~ s/$Ident\s*($String)/$1/) { @@ -6230,13 +6287,17 @@ sub process { } # check for function declarations that have arguments without identifier names +# while avoiding uninitialized_var(x) if (defined $stat && - $stat =~ /^.\s*(?:extern\s+)?$Type\s*(?:$Ident|\(\s*\*\s*$Ident\s*\))\s*\(\s*([^{]+)\s*\)\s*;/s && - $1 ne "void") { - my $args = trim($1); + $stat =~ /^.\s*(?:extern\s+)?$Type\s*(?:($Ident)|\(\s*\*\s*$Ident\s*\))\s*\(\s*([^{]+)\s*\)\s*;/s && + (!defined($1) || + (defined($1) && $1 ne "uninitialized_var")) && + $2 ne "void") { + my $args = trim($2); while ($args =~ m/\s*($Type\s*(?:$Ident|\(\s*\*\s*$Ident?\s*\)\s*$balanced_parens)?)/g) { my $arg = trim($1); - if ($arg =~ /^$Type$/ && $arg !~ /enum\s+$Ident$/) { + if ($arg =~ /^$Type$/ && + $arg !~ /enum\s+$Ident$/) { WARN("FUNCTION_ARGUMENTS", "function definition argument '$arg' should also have an identifier name\n" . $herecurr); } @@ -6389,6 +6450,28 @@ sub process { } } +# check for /* fallthrough */ like comment, prefer fallthrough; + my @fallthroughs = ( + 'fallthrough', + '@fallthrough@', + 'lint -fallthrough[ \t]*', + 'intentional(?:ly)?[ \t]*fall(?:(?:s | |-)[Tt]|t)hr(?:ough|u|ew)', + '(?:else,?\s*)?FALL(?:S | |-)?THR(?:OUGH|U|EW)[ \t.!]*(?:-[^\n\r]*)?', + 'Fall(?:(?:s | |-)[Tt]|t)hr(?:ough|u|ew)[ \t.!]*(?:-[^\n\r]*)?', + 'fall(?:s | |-)?thr(?:ough|u|ew)[ \t.!]*(?:-[^\n\r]*)?', + ); + if ($raw_comment ne '') { + foreach my $ft (@fallthroughs) { + if ($raw_comment =~ /$ft/) { + my $msg_level = \&WARN; + $msg_level = \&CHK if ($file); + &{$msg_level}("PREFER_FALLTHROUGH", + "Prefer 'fallthrough;' over fallthrough comment\n" . $herecurr); + last; + } + } + } + # check for switch/default statements without a break; if ($perl_version_ok && defined $stat && diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c index 2548ff8c4d9c1c75e1ad59baa346d9f5cf205b50..06ac7bd2144b77c5716cbebddd52cf22d98780e6 100644 --- a/tools/lib/rbtree.c +++ b/tools/lib/rbtree.c @@ -497,7 +497,7 @@ struct rb_node *rb_next(const struct rb_node *node) if (node->rb_right) { node = node->rb_right; while (node->rb_left) - node=node->rb_left; + node = node->rb_left; return (struct rb_node *)node; } @@ -528,7 +528,7 @@ struct rb_node *rb_prev(const struct rb_node *node) if (node->rb_left) { node = node->rb_left; while (node->rb_right) - node=node->rb_right; + node = node->rb_right; return (struct rb_node *)node; } diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c index 37a04dab56f0a90b1ff76e102ce1f090f8efaa18..11eee0b600407e2f83a036ae8d422acf0c744a56 100644 --- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c +++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c @@ -7,13 +7,14 @@ #include <pthread.h> #include <sys/epoll.h> #include <sys/socket.h> +#include <sys/eventfd.h> #include "../../kselftest_harness.h" struct epoll_mtcontext { int efd[3]; int sfd[4]; - int count; + volatile int count; pthread_t main; pthread_t waiter; @@ -3071,4 +3072,68 @@ TEST(epoll58) close(ctx.sfd[3]); } +static void *epoll59_thread(void *ctx_) +{ + struct epoll_mtcontext *ctx = ctx_; + struct epoll_event e; + int i; + + for (i = 0; i < 100000; i++) { + while (ctx->count == 0) + ; + + e.events = EPOLLIN | EPOLLERR | EPOLLET; + epoll_ctl(ctx->efd[0], EPOLL_CTL_MOD, ctx->sfd[0], &e); + ctx->count = 0; + } + + return NULL; +} + +/* + * t0 + * (p) \ + * e0 + * (et) / + * e0 + * + * Based on https://bugzilla.kernel.org/show_bug.cgi?id=205933 + */ +TEST(epoll59) +{ + pthread_t emitter; + struct pollfd pfd; + struct epoll_event e; + struct epoll_mtcontext ctx = { 0 }; + int i, ret; + + signal(SIGUSR1, signal_handler); + + ctx.efd[0] = epoll_create1(0); + ASSERT_GE(ctx.efd[0], 0); + + ctx.sfd[0] = eventfd(1, 0); + ASSERT_GE(ctx.sfd[0], 0); + + e.events = EPOLLIN | EPOLLERR | EPOLLET; + ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0); + + ASSERT_EQ(pthread_create(&emitter, NULL, epoll59_thread, &ctx), 0); + + for (i = 0; i < 100000; i++) { + ret = epoll_wait(ctx.efd[0], &e, 1, 1000); + ASSERT_GT(ret, 0); + + while (ctx.count != 0) + ; + ctx.count = 1; + } + if (pthread_tryjoin_np(emitter, NULL) < 0) { + pthread_kill(emitter, SIGUSR1); + pthread_join(emitter, NULL); + } + close(ctx.efd[0]); + close(ctx.sfd[0]); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index d3362777a4258415010a9621544aa649fc1a9f89..61e5cfeb1350a5142748f028e0ece811cb31c9b1 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -54,6 +54,7 @@ #include <linux/userfaultfd.h> #include <setjmp.h> #include <stdbool.h> +#include <assert.h> #include "../kselftest.h" @@ -76,6 +77,8 @@ static int test_type; #define ALARM_INTERVAL_SECS 10 static volatile bool test_uffdio_copy_eexist = true; static volatile bool test_uffdio_zeropage_eexist = true; +/* Whether to test uffd write-protection */ +static bool test_uffdio_wp = false; static bool map_shared; static int huge_fd; @@ -86,6 +89,13 @@ static char *area_src, *area_src_alias, *area_dst, *area_dst_alias; static char *zeropage; pthread_attr_t attr; +/* Userfaultfd test statistics */ +struct uffd_stats { + int cpu; + unsigned long missing_faults; + unsigned long wp_faults; +}; + /* pthread_mutex_t starts at page offset 0 */ #define area_mutex(___area, ___nr) \ ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) @@ -125,6 +135,37 @@ static void usage(void) exit(1); } +static void uffd_stats_reset(struct uffd_stats *uffd_stats, + unsigned long n_cpus) +{ + int i; + + for (i = 0; i < n_cpus; i++) { + uffd_stats[i].cpu = i; + uffd_stats[i].missing_faults = 0; + uffd_stats[i].wp_faults = 0; + } +} + +static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) +{ + int i; + unsigned long long miss_total = 0, wp_total = 0; + + for (i = 0; i < n_cpus; i++) { + miss_total += stats[i].missing_faults; + wp_total += stats[i].wp_faults; + } + + printf("userfaults: %llu missing (", miss_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].missing_faults); + printf("\b), %llu wp (", wp_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].wp_faults); + printf("\b)\n"); +} + static int anon_release_pages(char *rel_area) { int ret = 0; @@ -245,10 +286,15 @@ struct uffd_test_ops { void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); }; -#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ +#define SHMEM_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ (1 << _UFFDIO_COPY) | \ (1 << _UFFDIO_ZEROPAGE)) +#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ + (1 << _UFFDIO_COPY) | \ + (1 << _UFFDIO_ZEROPAGE) | \ + (1 << _UFFDIO_WRITEPROTECT)) + static struct uffd_test_ops anon_uffd_test_ops = { .expected_ioctls = ANON_EXPECTED_IOCTLS, .allocate_area = anon_allocate_area, @@ -257,7 +303,7 @@ static struct uffd_test_ops anon_uffd_test_ops = { }; static struct uffd_test_ops shmem_uffd_test_ops = { - .expected_ioctls = ANON_EXPECTED_IOCTLS, + .expected_ioctls = SHMEM_EXPECTED_IOCTLS, .allocate_area = shmem_allocate_area, .release_pages = shmem_release_pages, .alias_mapping = noop_alias_mapping, @@ -281,6 +327,21 @@ static int my_bcmp(char *str1, char *str2, size_t n) return 0; } +static void wp_range(int ufd, __u64 start, __u64 len, bool wp) +{ + struct uffdio_writeprotect prms = { 0 }; + + /* Write protection page faults */ + prms.range.start = start; + prms.range.len = len; + /* Undo write-protect, do wakeup after that */ + prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; + + if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) + fprintf(stderr, "clear WP failed for address 0x%Lx\n", + start), exit(1); +} + static void *locking_thread(void *arg) { unsigned long cpu = (unsigned long) arg; @@ -419,7 +480,10 @@ static int __copy_page(int ufd, unsigned long offset, bool retry) uffdio_copy.dst = (unsigned long) area_dst + offset; uffdio_copy.src = (unsigned long) area_src + offset; uffdio_copy.len = page_size; - uffdio_copy.mode = 0; + if (test_uffdio_wp) + uffdio_copy.mode = UFFDIO_COPY_MODE_WP; + else + uffdio_copy.mode = 0; uffdio_copy.copy = 0; if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { /* real retval in ufdio_copy.copy */ @@ -467,8 +531,8 @@ static int uffd_read_msg(int ufd, struct uffd_msg *msg) return 0; } -/* Return 1 if page fault handled by us; otherwise 0 */ -static int uffd_handle_page_fault(struct uffd_msg *msg) +static void uffd_handle_page_fault(struct uffd_msg *msg, + struct uffd_stats *stats) { unsigned long offset; @@ -476,25 +540,32 @@ static int uffd_handle_page_fault(struct uffd_msg *msg) fprintf(stderr, "unexpected msg event %u\n", msg->event), exit(1); - if (bounces & BOUNCE_VERIFY && - msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - fprintf(stderr, "unexpected write fault\n"), exit(1); + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { + wp_range(uffd, msg->arg.pagefault.address, page_size, false); + stats->wp_faults++; + } else { + /* Missing page faults */ + if (bounces & BOUNCE_VERIFY && + msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + fprintf(stderr, "unexpected write fault\n"), exit(1); - offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; - offset &= ~(page_size-1); + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; + offset &= ~(page_size-1); - return copy_page(uffd, offset); + if (copy_page(uffd, offset)) + stats->missing_faults++; + } } static void *uffd_poll_thread(void *arg) { - unsigned long cpu = (unsigned long) arg; + struct uffd_stats *stats = (struct uffd_stats *)arg; + unsigned long cpu = stats->cpu; struct pollfd pollfd[2]; struct uffd_msg msg; struct uffdio_register uffd_reg; int ret; char tmp_chr; - unsigned long userfaults = 0; pollfd[0].fd = uffd; pollfd[0].events = POLLIN; @@ -524,7 +595,7 @@ static void *uffd_poll_thread(void *arg) msg.event), exit(1); break; case UFFD_EVENT_PAGEFAULT: - userfaults += uffd_handle_page_fault(&msg); + uffd_handle_page_fault(&msg, stats); break; case UFFD_EVENT_FORK: close(uffd); @@ -543,50 +614,67 @@ static void *uffd_poll_thread(void *arg) break; } } - return (void *)userfaults; + + return NULL; } pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; static void *uffd_read_thread(void *arg) { - unsigned long *this_cpu_userfaults; + struct uffd_stats *stats = (struct uffd_stats *)arg; struct uffd_msg msg; - this_cpu_userfaults = (unsigned long *) arg; - *this_cpu_userfaults = 0; - pthread_mutex_unlock(&uffd_read_mutex); /* from here cancellation is ok */ for (;;) { if (uffd_read_msg(uffd, &msg)) continue; - (*this_cpu_userfaults) += uffd_handle_page_fault(&msg); + uffd_handle_page_fault(&msg, stats); } - return (void *)NULL; + + return NULL; } static void *background_thread(void *arg) { unsigned long cpu = (unsigned long) arg; - unsigned long page_nr; + unsigned long page_nr, start_nr, mid_nr, end_nr; + + start_nr = cpu * nr_pages_per_cpu; + end_nr = (cpu+1) * nr_pages_per_cpu; + mid_nr = (start_nr + end_nr) / 2; - for (page_nr = cpu * nr_pages_per_cpu; - page_nr < (cpu+1) * nr_pages_per_cpu; - page_nr++) + /* Copy the first half of the pages */ + for (page_nr = start_nr; page_nr < mid_nr; page_nr++) + copy_page_retry(uffd, page_nr * page_size); + + /* + * If we need to test uffd-wp, set it up now. Then we'll have + * at least the first half of the pages mapped already which + * can be write-protected for testing + */ + if (test_uffdio_wp) + wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, + nr_pages_per_cpu * page_size, true); + + /* + * Continue the 2nd half of the page copying, handling write + * protection faults if any + */ + for (page_nr = mid_nr; page_nr < end_nr; page_nr++) copy_page_retry(uffd, page_nr * page_size); return NULL; } -static int stress(unsigned long *userfaults) +static int stress(struct uffd_stats *uffd_stats) { unsigned long cpu; pthread_t locking_threads[nr_cpus]; pthread_t uffd_threads[nr_cpus]; pthread_t background_threads[nr_cpus]; - void **_userfaults = (void **) userfaults; finished = 0; for (cpu = 0; cpu < nr_cpus; cpu++) { @@ -595,12 +683,13 @@ static int stress(unsigned long *userfaults) return 1; if (bounces & BOUNCE_POLL) { if (pthread_create(&uffd_threads[cpu], &attr, - uffd_poll_thread, (void *)cpu)) + uffd_poll_thread, + (void *)&uffd_stats[cpu])) return 1; } else { if (pthread_create(&uffd_threads[cpu], &attr, uffd_read_thread, - &_userfaults[cpu])) + (void *)&uffd_stats[cpu])) return 1; pthread_mutex_lock(&uffd_read_mutex); } @@ -637,7 +726,8 @@ static int stress(unsigned long *userfaults) fprintf(stderr, "pipefd write error\n"); return 1; } - if (pthread_join(uffd_threads[cpu], &_userfaults[cpu])) + if (pthread_join(uffd_threads[cpu], + (void *)&uffd_stats[cpu])) return 1; } else { if (pthread_cancel(uffd_threads[cpu])) @@ -735,17 +825,31 @@ static int faulting_process(int signal_test) } for (nr = 0; nr < split_nr_pages; nr++) { + int steps = 1; + unsigned long offset = nr * page_size; + if (signal_test) { if (sigsetjmp(*sigbuf, 1) != 0) { - if (nr == lastnr) { + if (steps == 1 && nr == lastnr) { fprintf(stderr, "Signal repeated\n"); return 1; } lastnr = nr; if (signal_test == 1) { - if (copy_page(uffd, nr * page_size)) - signalled++; + if (steps == 1) { + /* This is a MISSING request */ + steps++; + if (copy_page(uffd, offset)) + signalled++; + } else { + /* This is a WP request */ + assert(steps == 2); + wp_range(uffd, + (__u64)area_dst + + offset, + page_size, false); + } } else { signalled++; continue; @@ -758,8 +862,13 @@ static int faulting_process(int signal_test) fprintf(stderr, "nr %lu memory corruption %Lu %Lu\n", nr, count, - count_verify[nr]), exit(1); - } + count_verify[nr]); + } + /* + * Trigger write protection if there is by writting + * the same value back. + */ + *area_count(area_dst, nr) = count; } if (signal_test) @@ -781,6 +890,11 @@ static int faulting_process(int signal_test) nr, count, count_verify[nr]), exit(1); } + /* + * Trigger write protection if there is by writting + * the same value back. + */ + *area_count(area_dst, nr) = count; } if (uffd_test_ops->release_pages(area_dst)) @@ -884,6 +998,8 @@ static int userfaultfd_zeropage_test(void) uffdio_register.range.start = (unsigned long) area_dst; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) fprintf(stderr, "register failure\n"), exit(1); @@ -908,11 +1024,11 @@ static int userfaultfd_events_test(void) { struct uffdio_register uffdio_register; unsigned long expected_ioctls; - unsigned long userfaults; pthread_t uffd_mon; int err, features; pid_t pid; char c; + struct uffd_stats stats = { 0 }; printf("testing events (fork, remap, remove): "); fflush(stdout); @@ -929,6 +1045,8 @@ static int userfaultfd_events_test(void) uffdio_register.range.start = (unsigned long) area_dst; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) fprintf(stderr, "register failure\n"), exit(1); @@ -939,7 +1057,7 @@ static int userfaultfd_events_test(void) "unexpected missing ioctl for anon memory\n"), exit(1); - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL)) + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) perror("uffd_poll_thread create"), exit(1); pid = fork(); @@ -955,13 +1073,14 @@ static int userfaultfd_events_test(void) if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) perror("pipe write"), exit(1); - if (pthread_join(uffd_mon, (void **)&userfaults)) + if (pthread_join(uffd_mon, NULL)) return 1; close(uffd); - printf("userfaults: %ld\n", userfaults); - return userfaults != nr_pages; + uffd_stats_report(&stats, 1); + + return stats.missing_faults != nr_pages; } static int userfaultfd_sig_test(void) @@ -973,6 +1092,7 @@ static int userfaultfd_sig_test(void) int err, features; pid_t pid; char c; + struct uffd_stats stats = { 0 }; printf("testing signal delivery: "); fflush(stdout); @@ -988,6 +1108,8 @@ static int userfaultfd_sig_test(void) uffdio_register.range.start = (unsigned long) area_dst; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) fprintf(stderr, "register failure\n"), exit(1); @@ -1004,7 +1126,7 @@ static int userfaultfd_sig_test(void) if (uffd_test_ops->release_pages(area_dst)) return 1; - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL)) + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) perror("uffd_poll_thread create"), exit(1); pid = fork(); @@ -1030,6 +1152,7 @@ static int userfaultfd_sig_test(void) close(uffd); return userfaults != 0; } + static int userfaultfd_stress(void) { void *area; @@ -1038,7 +1161,7 @@ static int userfaultfd_stress(void) struct uffdio_register uffdio_register; unsigned long cpu; int err; - unsigned long userfaults[nr_cpus]; + struct uffd_stats uffd_stats[nr_cpus]; uffd_test_ops->allocate_area((void **)&area_src); if (!area_src) @@ -1119,6 +1242,8 @@ static int userfaultfd_stress(void) uffdio_register.range.start = (unsigned long) area_dst; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { fprintf(stderr, "register failure\n"); return 1; @@ -1167,10 +1292,17 @@ static int userfaultfd_stress(void) if (uffd_test_ops->release_pages(area_dst)) return 1; + uffd_stats_reset(uffd_stats, nr_cpus); + /* bounce pass */ - if (stress(userfaults)) + if (stress(uffd_stats)) return 1; + /* Clear all the write protections if there is any */ + if (test_uffdio_wp) + wp_range(uffd, (unsigned long)area_dst, + nr_pages * page_size, false); + /* unregister */ if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) { fprintf(stderr, "unregister failure\n"); @@ -1209,10 +1341,7 @@ static int userfaultfd_stress(void) area_src_alias = area_dst_alias; area_dst_alias = tmp_area; - printf("userfaults:"); - for (cpu = 0; cpu < nr_cpus; cpu++) - printf(" %lu", userfaults[cpu]); - printf("\n"); + uffd_stats_report(uffd_stats, nr_cpus); } if (err) @@ -1252,6 +1381,8 @@ static void set_test_type(const char *type) if (!strcmp(type, "anon")) { test_type = TEST_ANON; uffd_test_ops = &anon_uffd_test_ops; + /* Only enable write-protect test for anonymous test */ + test_uffdio_wp = true; } else if (!strcmp(type, "hugetlb")) { test_type = TEST_HUGETLB; uffd_test_ops = &hugetlb_uffd_test_ops;